This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit 0bf63df

Cleanup batchmatmul benchmark
This commit refactors the batchmatmul benchmark, saves the best options found by the tuner, and sets up the Caffe2/ATen/CUBLAS/CUDNN baselines for future reproducibility.
1 parent 93f7cb4 commit 0bf63df
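As a reading aid for the diff below: the refactoring moves the tuned tc::CudaMappingOptions values into tc/benchmarks/batchmatmul.h, and each TEST_F body now just sets the problem sizes with Init(B, N, M, K) and dispatches to the TC, ATen, or Caffe2 path. A minimal sketch of how a further configuration would be added under this scheme follows; the GPU name, sizes, and options constant are hypothetical and not part of this commit.

// Hypothetical extension following the pattern this commit introduces
// (illustrative names and sizes only; this snippet would live in
// batchmatmul.cc, with the options constant saved in batchmatmul.h, e.g.
//   auto options_TransposedBatchMatMul_MyGPU_autotuned_B_32_K_64_M_128_N_64 =
//       tc::CudaMappingOptions::makeNaiveMappingOptions()
//           .tile(1)
//           .mapToThreads(128)
//           .mapToBlocks(32)
//           .useSharedMemory(true);
// ).
TEST_F(BatchMatMul, TransposedBatchMatMul_MyGPU_autotuned_B_32_K_64_M_128_N_64) {
  Init(32, 64, 128, 64); // B, N, M, K
  runBatchMatMul(
      tc::options_TransposedBatchMatMul_MyGPU_autotuned_B_32_K_64_M_128_N_64);
}

// A baseline on the same sizes reuses the new helpers instead of tuned options.
TEST_F(BatchMatMul, TransposedBatchMatMul_ATen_MyGPU_B_32_K_64_M_128_N_64) {
  Init(32, 64, 128, 64);
  runATenBatchMatMul();
}

Assuming the usual gflags mapping of FLAGS_x to --x, running the generic TransposedBatchMatMul test with --autotune re-tunes and writes the cache/best protos under --save_tuner_proto_prefix, which is where the saved options in batchmatmul.h come from.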

File tree

2 files changed (+133, −85 lines)


tc/benchmarks/batchmatmul.cc

Lines changed: 73 additions & 85 deletions
@@ -13,6 +13,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include "batchmatmul.h"
+
 #include <iostream>
 #include <string>
 #include <vector>
@@ -43,23 +45,22 @@ DEFINE_uint32(M, 72, "M dimension in Z(b, n, m) += X(b, n, kk) * Y(b, kk, m)");
 DEFINE_uint32(K, 26, "K dimension in Z(b, n, m) += X(b, n, kk) * Y(b, kk, m)");
 
 class BatchMatMul : public Benchmark {
+ protected:
+  uint32_t B, N, M, K;
+
  public:
-  void runBatchMatMul(
-      uint32_t B,
-      uint32_t N,
-      uint32_t M,
-      uint32_t K,
-      const tc::CudaMappingOptions& options,
-      bool use_flags = false);
+  void Init(uint32_t b, uint32_t n, uint32_t m, uint32_t k) {
+    B = b;
+    N = n;
+    M = m;
+    K = k;
+  }
+  void runBatchMatMul(const tc::CudaMappingOptions& options);
+  void runCaffe2BatchMatMul();
+  void runATenBatchMatMul();
 };
 
-void BatchMatMul::runBatchMatMul(
-    uint32_t B,
-    uint32_t N,
-    uint32_t M,
-    uint32_t K,
-    const tc::CudaMappingOptions& options,
-    bool use_flags) {
+void BatchMatMul::runBatchMatMul(const tc::CudaMappingOptions& options) {
   at::Tensor X = at::CUDA(at::kFloat).rand({B, N, M});
   at::Tensor Y = at::CUDA(at::kFloat).rand({B, M, K});
 
@@ -85,96 +86,83 @@ def batch_matmul(float(B, N, M) X, float(B, M, K) Y) -> (Z) {
   std::string suffix = std::string("_B_") + std::to_string(FLAGS_B) +
       std::string("_K_") + std::to_string(FLAGS_K) + std::string("_M_") +
       std::to_string(FLAGS_M) + std::string("_N_") + std::to_string(FLAGS_N);
-  if (use_flags && FLAGS_validate_proto) {
-    validateProto(
+  std::vector<tc::CudaMappingOptions> bestOptions{options};
+  if (FLAGS_autotune) {
+    bestOptions = autotune(
         FLAGS_save_tuner_proto_prefix + std::string("/batchmatmul_cache") +
             suffix,
+        FLAGS_save_tuner_proto_prefix + std::string("/batchmatmul_best") +
+            suffix,
         tc,
         "batch_matmul",
         inputs,
+        options,
         check_fun);
-  } else {
-    Check(tc, "batch_matmul", options, inputs, check_fun);
-    if (use_flags) {
-      autotune(
-          FLAGS_save_tuner_proto_prefix + std::string("/batchmatmul_cache") +
-              suffix,
-          FLAGS_save_tuner_proto_prefix + std::string("/batchmatmul_best") +
-              suffix,
-          tc,
-          "batch_matmul",
-          inputs,
-          options,
-          check_fun);
-    }
   }
+  Check(tc, "batch_matmul", bestOptions[0], inputs, check_fun);
 }
 
-TEST_F(BatchMatMul, TransposedBatchMatMul) {
-  auto B = FLAGS_B;
-  auto N = FLAGS_N;
-  auto M = FLAGS_M;
-  auto K = FLAGS_K;
-  auto options = tc::CudaMappingOptions::makeNaiveMappingOptions()
-                     .tile(1)
-                     .mapToThreads({128})
-                     .mapToBlocks({B})
-                     .useSharedMemory(true)
-                     .usePrivateMemory(true)
-                     .unroll(256);
-  runBatchMatMul(B, N, M, K, options, true);
-}
-
-TEST_F(BatchMatMul, TransposedBatchMatMul_P100_autotuned_B_500_K_26_M_72_N_26) {
-  uint32_t B = 500;
-  uint32_t K = 26;
-  uint32_t M = 72;
-  uint32_t N = 26;
-  auto options = tc::CudaMappingOptions::makeNaiveMappingOptions()
-                     .outerScheduleFusionStrategy(tc::FusionStrategy::Max)
-                     .outerScheduleAllowSkewing(false)
-                     .outerSchedulePositiveOrthant(true)
-                     .intraTileScheduleFusionStrategy(tc::FusionStrategy::Min)
-                     .intraTileScheduleAllowSkewing(false)
-                     .intraTileSchedulePositiveOrthant(true)
-                     .tile(3)
-                     .mapToThreads(4, 36, 3)
-                     .mapToBlocks(512)
-                     .unroll(64)
-                     .tileImperfectlyNested(false)
-                     .useSharedMemory(true)
-                     .usePrivateMemory(false)
-                     .unrollCopyShared(true)
-                     .matchLibraryCalls(true);
-  runBatchMatMul(B, N, M, K, options);
+void BatchMatMul::runCaffe2BatchMatMul() {
+  Workspace w_ref;
+  auto AddInput = AddDeterministicallyRandomInput<caffe2::CUDABackend, float>;
+  AddInput(w_ref, {B, N, M}, "X");
+  AddInput(w_ref, {B, M, K}, "Y");
+  OperatorDef ref_def =
+      MakeOperatorDef<caffe2::CUDABackend>("BatchMatMul", {"X", "Y"}, {"Z"});
+  std::unique_ptr<OperatorBase> net(CreateOperator(ref_def, &w_ref));
+  Reference([&]() { return true; }, [&](bool flag) { net->Run(); });
 }
 
-TEST_F(BatchMatMul, ATenTransposedBatchMatMulReference) {
-  auto B = FLAGS_B;
-  auto N = FLAGS_N;
-  auto M = FLAGS_M;
-  auto K = FLAGS_K;
+void BatchMatMul::runATenBatchMatMul() {
   at::Tensor X = at::CUDA(at::kFloat).rand({B, N, M});
   at::Tensor Y = at::CUDA(at::kFloat).rand({B, M, K});
   Reference(
       [&]() { return bmm(X, Y); },
       [&](at::Tensor& res) { bmm_out(res, X, Y); });
 }
 
-TEST_F(BatchMatMul, C2TransposedBatchMatMulReference) {
-  int B = FLAGS_B;
-  int N = FLAGS_N;
-  int M = FLAGS_M;
-  int K = FLAGS_K;
+// Generic
+TEST_F(BatchMatMul, TransposedBatchMatMul) {
+  Init(FLAGS_B, FLAGS_N, FLAGS_M, FLAGS_K);
+  runBatchMatMul(tc::CudaMappingOptions::makeNaiveMappingOptions());
+}
 
-  Workspace w_ref;
-  auto AddInput = AddDeterministicallyRandomInput<caffe2::CUDABackend, float>;
-  AddInput(w_ref, {B, N, M}, "X");
-  AddInput(w_ref, {B, M, K}, "Y");
-  OperatorDef ref_def =
-      MakeOperatorDef<caffe2::CUDABackend>("BatchMatMul", {"X", "Y"}, {"Z"});
-  std::unique_ptr<OperatorBase> net(CreateOperator(ref_def, &w_ref));
-  Reference([&]() { return true; }, [&](bool flag) { net->Run(); });
+// P100 TC
+TEST_F(BatchMatMul, TransposedBatchMatMul_P100_autotuned_B_500_K_26_M_72_N_26) {
+  Init(500, 26, 72, 26);
+  runBatchMatMul(
+      tc::options_TransposedBatchMatMul_P100_autotuned_B_500_K_26_M_72_N_26);
+}
+
+// P100 ATen
+TEST_F(BatchMatMul, TransposedBatchMatMul_ATen_P100_B_500_K_26_M_72_N_26) {
+  Init(500, 26, 72, 26);
+  runATenBatchMatMul();
+}
+
+// P100 Caffe2
+TEST_F(BatchMatMul, TransposedBatchMatMul_Caffe2_P100_B_500_K_26_M_72_N_26) {
+  Init(500, 26, 72, 26);
+  runCaffe2BatchMatMul();
+}
+
+// V100 TC
+TEST_F(BatchMatMul, TransposedBatchMatMul_V100_autotuned_B_500_K_26_M_72_N_26) {
+  Init(500, 26, 72, 26);
+  runBatchMatMul(
+      tc::options_TransposedBatchMatMul_V100_autotuned_B_500_K_26_M_72_N_26);
+}
+
+// V100 ATen
+TEST_F(BatchMatMul, TransposedBatchMatMul_ATen_V100_B_500_K_26_M_72_N_26) {
+  Init(500, 26, 72, 26);
+  runATenBatchMatMul();
+}
+
+// V100 Caffe2
+TEST_F(BatchMatMul, TransposedBatchMatMul_Caffe2_V100_B_500_K_26_M_72_N_26) {
+  Init(500, 26, 72, 26);
+  runCaffe2BatchMatMul();
 }
 
 int main(int argc, char** argv) {

tc/benchmarks/batchmatmul.h

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
+/**
+ * Copyright (c) 2017-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "tc/aten/aten.h"
+#include "tc/core/cuda/cuda_mapping_options.h"
+
+namespace tc {
+auto options_TransposedBatchMatMul_P100_autotuned_B_500_K_26_M_72_N_26 =
+    tc::CudaMappingOptions::makeNaiveMappingOptions()
+        .outerScheduleFusionStrategy(tc::FusionStrategy::Max)
+        .outerScheduleAllowSkewing(false)
+        .outerSchedulePositiveOrthant(true)
+        .intraTileScheduleFusionStrategy(tc::FusionStrategy::Min)
+        .intraTileScheduleAllowSkewing(false)
+        .intraTileSchedulePositiveOrthant(true)
+        .tile(3)
+        .mapToThreads(4, 36, 3)
+        .mapToBlocks(512)
+        .unroll(64)
+        .tileImperfectlyNested(false)
+        .useSharedMemory(true)
+        .usePrivateMemory(false)
+        .unrollCopyShared(true)
+        .matchLibraryCalls(true);
+
+auto options_TransposedBatchMatMul_V100_autotuned_B_500_K_26_M_72_N_26 =
+    tc::CudaMappingOptions::makeNaiveMappingOptions()
+        .outerScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident)
+        .outerScheduleAllowSkewing(false)
+        .outerSchedulePositiveOrthant(true)
+        .intraTileScheduleFusionStrategy(
+            tc::FusionStrategy::Preserve3Coincident)
+        .intraTileScheduleAllowSkewing(false)
+        .intraTileSchedulePositiveOrthant(true)
+        .fixParametersBeforeScheduling(true)
+        .tile(1, 1, 32, 32)
+        .unroll(1)
+        .tileImperfectlyNested(false)
+        .matchLibraryCalls(false)
+        .mapToThreads(72)
+        .mapToBlocks(500)
+        .useSharedMemory(true)
+        .usePrivateMemory(true)
+        .unrollCopyShared(false)
+        .useReadOnlyCache(false);
+} // namespace tc
