
Commit f1ceb6c

Adding bmm, mm, view_copy, slice_copy, split_with_sizes_copy optimizations

Differential Revision: D72798507
Pull Request resolved: #9877

1 parent 6b877de commit f1ceb6c

File tree

7 files changed: +642 −8 lines changed


backends/cadence/aot/functions_hifi.yaml

Lines changed: 9 additions & 4 deletions
@@ -35,7 +35,7 @@
 - op: bmm.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::bmm_out
+      kernel_name: cadence::impl::HiFi::bmm_out
 
 - op: cat.out
   kernels:
@@ -107,6 +107,11 @@
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::minimum_out
 
+- op: mm.out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::mm_out
+
 - op: mul.out
   kernels:
     - arg_meta: null
@@ -150,12 +155,12 @@
 - op: slice_copy.Tensor_out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::slice_copy_Tensor_out
+      kernel_name: cadence::impl::HiFi::slice_copy_Tensor_out
 
 - op: split_with_sizes_copy.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::split_with_sizes_copy_out
+      kernel_name: cadence::impl::HiFi::split_with_sizes_copy_out
 
 - op: sub.out
   kernels:
@@ -170,7 +175,7 @@
 - op: view_copy.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::view_copy_out
+      kernel_name: cadence::impl::HiFi::view_copy_out
 
 - op: where.self_out
   kernels:
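
Each kernel_name above is resolved by the ExecuTorch kernel codegen to a concrete C++ symbol; the entries changed here repoint five ops from the portable torch::executor kernels to the HiFi-optimized ones. As a reference, a sketch of the declaration the new bmm.out binding resolves to, mirroring the op_bmm.cpp added below (by the repo's codegen convention, the yaml name omits the trailing native namespace in which the kernel is defined):

// Sketch: the out-variant signature behind the bmm.out binding.
// Context first, inputs next, out last, returned by reference.
#include <executorch/runtime/kernel/kernel_includes.h>

namespace cadence {
namespace impl {
namespace HiFi {
namespace native {

exec_aten::Tensor& bmm_out(
    executorch::runtime::KernelRuntimeContext& ctx,
    const exec_aten::Tensor& in, // [batch, m, n]
    const exec_aten::Tensor& mat2, // [batch, n, p]
    exec_aten::Tensor& out); // resized to [batch, m, p] by the kernel

} // namespace native
} // namespace HiFi
} // namespace impl
} // namespace cadence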

backends/cadence/hifi/operators/CMakeLists.txt

Lines changed: 5 additions & 4 deletions
@@ -22,34 +22,35 @@ endif()
 set(_aten_ops__srcs
   "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_add.cpp"
   "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_atan2.cpp"
+  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_bmm.cpp"
   "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_cat.cpp"
   "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_clamp.cpp"
   "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp"
   "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_full.cpp"
   "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_maximum.cpp"
   "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mean.cpp"
   "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_minimum.cpp"
+  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mm.cpp"
   "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp"
   "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_permute_copy.cpp"
   "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_pow.cpp"
   "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_remainder.cpp"
   "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_rsqrt.cpp"
+  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_slice_copy.cpp"
   "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_softmax.cpp"
+  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_split_with_sizes_copy.cpp"
   "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp"
   "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sub.cpp"
   "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_tanh.cpp"
+  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_view_copy.cpp"
   "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_where.cpp"
-  "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_embedding.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_gt.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_gelu.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_hardtanh.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_max_pool2d_with_indices.cpp"
-  "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_slice_copy.cpp"
-  "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_split_with_sizes_copy.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_to_copy.cpp"
-  "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_view_copy.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/activation_ops_util.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/broadcast_util.cpp"
backends/cadence/hifi/operators/op_bmm.cpp

Lines changed: 171 additions & 0 deletions

@@ -0,0 +1,171 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <cstring>

#include <executorch/backends/cadence/hifi/kernels/kernels.h>
#include <executorch/kernels/portable/cpu/util/matmul_ops_util.h>
#include <executorch/kernels/portable/cpu/vec_ops.h>
#include <executorch/runtime/kernel/kernel_includes.h>

using Tensor = exec_aten::Tensor;
using exec_aten::ScalarType;
using executorch::runtime::KernelRuntimeContext;
using executorch::runtime::kTensorDimensionLimit;
using executorch::runtime::resize_tensor;
using executorch::runtime::tensors_have_same_dim_order;
using executorch::runtime::tensor_is_default_dim_order;
using torch::executor::check_bmm_args;
using torch::executor::Error;
using torch::executor::get_bmm_out_target_size;

namespace cadence {
namespace impl {
namespace HiFi {
namespace native {

Tensor& bmm_out(
    KernelRuntimeContext& ctx,
    const Tensor& in,
    const Tensor& mat2,
    Tensor& out) {
  ET_KERNEL_CHECK(ctx, check_bmm_args(in, mat2, out), InvalidArgument, out);

  ET_KERNEL_CHECK(
      ctx, tensors_have_same_dim_order(in, mat2, out), InvalidArgument, out);

  ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out);

  size_t output_ndim = 0;
  exec_aten::SizesType output_sizes[kTensorDimensionLimit];
  get_bmm_out_target_size(in, mat2, output_sizes, &output_ndim);
  ET_KERNEL_CHECK(
      ctx,
      resize_tensor(out, {output_sizes, output_ndim}) == Error::Ok,
      InvalidArgument,
      out);

  constexpr auto name = "bmm.out";
  constexpr int kNnlibMaxDim = 3;

  // NNLib fast path: float32 tensors of rank <= 3 only.
  bool optimized = true;

  if (out.scalar_type() != ScalarType::Float)
    optimized = false;

  if (in.dim() > kNnlibMaxDim)
    optimized = false;

  if (optimized) {
    const float* in_data = in.const_data_ptr<float>();
    const float* mat2_data = mat2.const_data_ptr<float>();
    float* out_data = out.mutable_data_ptr<float>();

    int64_t batch_size = in.size(0);
    int64_t m = in.size(1);
    int64_t n = in.size(2);
    int64_t p = mat2.size(2);

    WORD32 rows = m;
    WORD32 cols1 = n;
    WORD32 row_stride1 = n;
    WORD32 vec_count = p;
    WORD32 vec_offset = n;
    WORD32 out_offset = 1;
    WORD32 out_stride = p;

    WORD32* __restrict__ tmp =
        (WORD32* __restrict__)kernels::allocate_temp_memory(
            ctx, (batch_size * m * p) * sizeof(float));

    ET_KERNEL_CHECK(ctx, tmp != nullptr, MemoryAllocationFailed, out);

    // Zero the scratch buffer; it is passed to the matmul below as an
    // all-zero bias. (The original `tmp[batch_size * m * p] = {0};` wrote
    // one element past the end of the allocation.)
    std::memset(tmp, 0, (batch_size * m * p) * sizeof(float));

    // Scratch buffer for the transposed copy of mat2, reused per batch.
    WORD32* __restrict__ p_o =
        (WORD32* __restrict__)kernels::allocate_temp_memory(
            ctx, (batch_size * m * p) * sizeof(WORD32));

    ET_KERNEL_CHECK(ctx, p_o != nullptr, MemoryAllocationFailed, out);

    for (int i = 0; i < batch_size; ++i) {
      const FLOAT32* __restrict__ p_mat1 = in_data + i * m * n;
      const FLOAT32* __restrict__ p_vec1 = mat2_data + i * n * p;
      FLOAT32* __restrict__ p_out = out_data + i * m * p;
      const FLOAT32* __restrict__ p_bias = (const FLOAT32* __restrict__)tmp;

      WORD32* p_inp = (WORD32*)p_vec1;

      // The matmul below consumes its second operand as vec_count row
      // vectors of length cols1, so transpose this batch of mat2 from
      // (n x p) to (p x n) first.
      WORD32 p_inp_shape[kNnlibMaxDim];
      p_inp_shape[0] = n;
      p_inp_shape[1] = p;
      p_inp_shape[2] = 1;

      WORD32 p_out_shape[kNnlibMaxDim];
      p_out_shape[0] = p;
      p_out_shape[1] = n;
      p_out_shape[2] = 1;

      WORD32 p_permute_vec[kNnlibMaxDim] = {1, 0, 2};

      WORD32 num_out_dims = kNnlibMaxDim;
      WORD32 num_inp_dims = kNnlibMaxDim;

      xa_nn_transpose_32_32(
          p_o,
          p_out_shape,
          p_inp,
          p_inp_shape,
          p_permute_vec,
          num_out_dims,
          num_inp_dims);

      const FLOAT32* __restrict__ p_vec = (const FLOAT32* __restrict__)p_o;

      xa_nn_matmul_f32xf32_f32(
          p_out,
          p_mat1,
          p_vec,
          p_bias,
          rows,
          cols1,
          row_stride1,
          vec_count,
          vec_offset,
          out_offset,
          out_stride);
    }

    return out;
  }

  // Portable fallback: per-batch vec_matmul over the remaining real types
  // (plus Half).
  ET_SWITCH_REAL_TYPES_AND(Half, in.scalar_type(), ctx, name, CTYPE, [&]() {
    const CTYPE* in_data = in.const_data_ptr<CTYPE>();
    const CTYPE* mat2_data = mat2.const_data_ptr<CTYPE>();
    CTYPE* out_data = out.mutable_data_ptr<CTYPE>();

    int64_t batch_size = in.size(0);
    int64_t m = in.size(1);
    int64_t n = in.size(2);
    int64_t p = mat2.size(2);

    for (int i = 0; i < batch_size; ++i) {
      const CTYPE* in_data_offset = in_data + i * m * n;
      const CTYPE* mat2_data_offset = mat2_data + i * n * p;
      CTYPE* out_data_offset = out_data + i * m * p;

      torch::executor::vec_matmul<CTYPE>(
          out_data_offset, in_data_offset, mat2_data_offset, m, n, p);
    }
  });

  return out;
}

} // namespace native
} // namespace HiFi
} // namespace impl
} // namespace cadence
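
A note on the optimized path above: xa_nn_matmul_f32xf32_f32 consumes its second operand as vec_count row vectors of length cols1, so each batch of mat2 (n x p) is first transposed to (p x n) with xa_nn_transpose_32_32, and the result is written back in row-major order via out_offset = 1 and out_stride = p. A standalone plain-C++ sketch of the same transpose-then-matmul decomposition for one batch (names are illustrative, not the NNLib API):

// out[r][c] = sum_k in[r][k] * mat2[k][c], for one batch of bmm.
#include <cstdio>
#include <vector>

static void reference_mm(
    const float* in, const float* mat2, float* out, int m, int n, int p) {
  // Transpose mat2 (n x p) to (p x n) so each output column becomes a
  // dot product of an `in` row with a contiguous "vector" -- the layout
  // the HiFi transpose + matmul pair relies on.
  std::vector<float> mat2_t(p * n);
  for (int k = 0; k < n; ++k)
    for (int c = 0; c < p; ++c)
      mat2_t[c * n + k] = mat2[k * p + c];

  for (int r = 0; r < m; ++r) {
    for (int c = 0; c < p; ++c) {
      float acc = 0.f; // all-zero bias, as in the kernel
      for (int k = 0; k < n; ++k)
        acc += in[r * n + k] * mat2_t[c * n + k];
      // out_offset = 1, out_stride = p: row-major output layout
      out[r * p + c] = acc;
    }
  }
}

int main() {
  // 1x2x3 @ 1x3x2 -> 1x2x2
  const float in[] = {1, 2, 3, 4, 5, 6};
  const float mat2[] = {1, 0, 0, 1, 1, 1};
  float out[4];
  reference_mm(in, mat2, out, 2, 3, 2);
  std::printf("%g %g\n%g %g\n", out[0], out[1], out[2], out[3]); // 4 5 / 10 11
  return 0;
}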
