Reapply "Add vectorized_math.h (#11204)", "Add optimized_portable_kernels test (#11205)", and "Add vectorization in elementwise_util (#9432)" #11682

Open · wants to merge 2 commits into main
4 changes: 4 additions & 0 deletions .lintrunner.toml
@@ -271,6 +271,10 @@ exclude_patterns = [
     'examples/**',
     'exir/verification/bindings.cpp',
     'extension/**',
+    # Uses properly-gated (ET_USE_PYTORCH_HEADERS) ATen include.
+    'kernels/portable/cpu/util/elementwise_util.h',
+    'kernels/portable/cpu/util/math_util.h',
+    'kernels/portable/cpu/util/vectorized_math.h',
     'kernels/optimized/**',
     'runtime/core/exec_aten/**',
     # Want to be able to keep c10 in sync with PyTorch core.
2 changes: 1 addition & 1 deletion kernels/optimized/CMakeLists.txt
@@ -60,7 +60,7 @@ message("Generated files ${gen_command_sources}")
 list(TRANSFORM _optimized_kernels__srcs PREPEND "${EXECUTORCH_ROOT}/")
 add_library(optimized_kernels ${_optimized_kernels__srcs})
 target_include_directories(optimized_kernels PRIVATE ${TORCH_INCLUDE_DIRS} "${EXECUTORCH_ROOT}/third-party/pocketfft")
-target_compile_definitions(optimized_kernels PRIVATE ET_USE_PYTORCH_HEADERS)
+target_compile_definitions(optimized_kernels PRIVATE "ET_USE_PYTORCH_HEADERS=ET_HAS_EXCEPTIONS")
 target_link_libraries(
   optimized_kernels PUBLIC executorch_core cpublas extension_threadpool kernels_util_all_deps
 )
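
The definition changes from a bare flag to a valued macro: ET_USE_PYTORCH_HEADERS now expands to ET_HAS_EXCEPTIONS, so a consuming header can test the macro's value with #if and pull in ATen's vectorization machinery only when the toolchain builds with exceptions. A minimal sketch of the gating pattern this feeds, assuming the header tests the value; the exact spelling inside elementwise_util.h may differ:

#if defined(ET_USE_PYTORCH_HEADERS) && ET_USE_PYTORCH_HEADERS
// at::vec::Vectorized and friends; code in these headers can throw, hence
// the exceptions gate above.
#include <ATen/cpu/vec/vec.h>
#endif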
11 changes: 9 additions & 2 deletions kernels/portable/CMakeLists.txt
@@ -68,9 +68,16 @@ if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
   target_link_libraries(optimized_portable_kernels PUBLIC extension_threadpool)
   target_compile_options(optimized_portable_kernels PUBLIC ${_common_compile_options})
   target_include_directories(optimized_portable_kernels PRIVATE ${TORCH_INCLUDE_DIRS})
-  target_compile_definitions(optimized_portable_kernels PRIVATE ET_USE_PYTORCH_HEADERS)
+  target_compile_definitions(optimized_portable_kernels PRIVATE "ET_USE_PYTORCH_HEADERS=ET_HAS_EXCEPTIONS")
+  gen_selected_ops(LIB_NAME "optimized_portable_ops_lib" OPS_SCHEMA_YAML "${_yaml}")
+  generate_bindings_for_kernels(
+    LIB_NAME "optimized_portable_ops_lib" FUNCTIONS_YAML "${_yaml}"
+  )
+  gen_operators_lib(
+    LIB_NAME "optimized_portable_ops_lib" KERNEL_LIBS optimized_portable_kernels DEPS executorch_core
+  )
   install(
-    TARGETS optimized_portable_kernels
+    TARGETS optimized_portable_kernels optimized_portable_ops_lib
     DESTINATION lib
   )
 endif()
12 changes: 8 additions & 4 deletions kernels/portable/cpu/op_add.cpp
@@ -102,14 +102,18 @@ Tensor& add_scalar_out(
   static constexpr const char op_name[] = "add.Scalar_out";
 
   ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
+    CTYPE_COMPUTE val_b = utils::scalar_to<CTYPE_COMPUTE>(b);
+    CTYPE_COMPUTE val_alpha = utils::scalar_to<CTYPE_COMPUTE>(alpha);
+    auto val_alpha_times_b = val_alpha * val_b;
     utils::apply_unitensor_elementwise_fn<
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-        [b, alpha](const auto val_a) {
-          CTYPE_COMPUTE val_b = utils::scalar_to<CTYPE_COMPUTE>(b);
-          CTYPE_COMPUTE val_alpha = utils::scalar_to<CTYPE_COMPUTE>(alpha);
-          return val_a + val_alpha * val_b;
+        [val_alpha_times_b](const auto val_a) {
+          // Cast here supports vectorization; either it does nothing
+          // or it casts from CTYPE_COMPUTE to
+          // Vectorized<CTYPE_COMPUTE>.
+          return val_a + decltype(val_a)(val_alpha_times_b);
         },
         ctx,
         a,
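
The comment in the new lambda is the heart of the reapplied vectorization scheme: the lambda stays generic over its argument type, and decltype(val_a)(val_alpha_times_b) broadcasts the precomputed scalar across lanes when val_a is an ATen Vectorized pack, while compiling to a no-op cast on the plain scalar path. A self-contained sketch of the same trick (add_scaled is my illustrative name, not a helper from the PR):

#include <ATen/cpu/vec/vec.h>

// Hypothetical helper, not part of the PR: one generic lambda serves both
// instantiations. decltype(val)(c) broadcasts c across lanes when val is an
// at::vec::Vectorized<float>, and is a no-op cast when val is a plain float.
template <typename T>
T add_scaled(T val_a, float val_alpha_times_b) {
  auto fn = [val_alpha_times_b](const auto val) {
    return val + decltype(val)(val_alpha_times_b);
  };
  return fn(val_a);
}

void demo() {
  float s = add_scaled(1.0f, 2.0f); // scalar path: 3.0f
  auto v = add_scaled(at::vec::Vectorized<float>(1.0f), 2.0f);
  // vector path: every lane of v is 3.0f
  (void)s;
  (void)v;
}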
2 changes: 1 addition & 1 deletion kernels/portable/cpu/op_atan2.cpp
@@ -60,7 +60,7 @@ Tensor& atan2_out(
       op_name,
       utils::SupportedTensorDtypes::FLOATHBF16>(
       [](const auto val_a, const auto val_b) {
-        return std::atan2(val_a, val_b);
+        return executorch::math::atan2(val_a, val_b);
       },
       ctx,
       a,
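
executorch::math::atan2 comes from the reapplied vectorized_math.h and is what lets this auto lambda vectorize: the one call resolves to the standard library for scalars and to ATen's lane-wise member function for Vectorized arguments. A sketch of the overload pair such a wrapper plausibly provides; names and signatures here are illustrative, not the actual contents of vectorized_math.h:

#include <cmath>
#include <ATen/cpu/vec/vec.h>

namespace sketch_math {

// Scalar overload: defer to the standard library.
template <typename T>
T atan2(T a, T b) {
  return std::atan2(a, b);
}

// Vector overload: defer to ATen's per-lane implementation.
template <typename T>
at::vec::Vectorized<T> atan2(
    at::vec::Vectorized<T> a,
    at::vec::Vectorized<T> b) {
  return a.atan2(b);
}

} // namespace sketch_math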
5 changes: 2 additions & 3 deletions kernels/portable/cpu/op_clamp.cpp
@@ -138,9 +138,8 @@ Tensor& clamp_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-        [has_min, min_opt, has_max, max_opt](const CTYPE_COMPUTE val_in) {
-          // TODO: rewrite this to be vectorization-capable.
-          CTYPE_COMPUTE val_out = val_in;
+        [has_min, min_opt, has_max, max_opt](const auto val_in) {
+          auto val_out = val_in;
           if (has_min) {
             val_out = utils::max_override(
                 val_out, utils::scalar_to<CTYPE_COMPUTE>(min_opt.value()));
3 changes: 1 addition & 2 deletions kernels/portable/cpu/op_elu.cpp
@@ -48,8 +48,7 @@ Tensor& elu_out(
       CTYPE,
       op_name,
       utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-      [negcoef, math_scale, math_input_scale](const auto x) {
-        // TODO: rewrite this to be vectorization-capable.
+      [negcoef, math_scale, math_input_scale](const CTYPE x) {
         return MathT(x) <= MathT(0)
             ? std::expm1(MathT(x) * math_input_scale) * negcoef
             : MathT(x) * math_scale;
8 changes: 3 additions & 5 deletions kernels/portable/cpu/op_fmod.cpp
@@ -61,7 +61,7 @@ Tensor& fmod_Tensor_out(
         utils::SupportedTensorDtypes::REALHBF16>(
         [&div_by_zero_error](
             const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
-          // TODO: rewrite this to be vectorization-capable.
+          // TODO: rewrite this to be vectorization-capable?
           CTYPE_COMPUTE value = 0;
           if (is_integral_type<CTYPE_COMPUTE, /*includeBool=*/true>::value) {
             if (val_b == 0) {
@@ -138,10 +138,8 @@ Tensor& fmod_Scalar_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::REALHBF16>(
-        [val_b](const CTYPE_COMPUTE val_a) {
-          // TODO: rewrite this to be vectorization-capable.
-          CTYPE_COMPUTE value = std::fmod(val_a, val_b);
-          return value;
+        [val_b](const auto val_a) {
+          return executorch::math::fmod(val_a, (decltype(val_a))val_b);
         },
         ctx,
         a,
2 changes: 1 addition & 1 deletion kernels/portable/cpu/op_maximum.cpp
@@ -49,7 +49,7 @@ Tensor& maximum_out(
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::REALHBBF16>(
-      [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
+      [](const auto val_a, const auto val_b) {
         return utils::max_override(val_a, val_b);
       },
       ctx,
3 changes: 1 addition & 2 deletions kernels/portable/cpu/op_minimum.cpp
@@ -49,8 +49,7 @@ Tensor& minimum_out(
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::REALHBBF16>(
-      [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
-        // TODO: rewrite this to be vectorization-capable.
+      [](const auto val_a, const auto val_b) {
         return utils::min_override(val_a, val_b);
       },
       ctx,
4 changes: 1 addition & 3 deletions kernels/portable/cpu/op_mul.cpp
@@ -72,9 +72,7 @@ Tensor& mul_out(
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::REALHBBF16>(
-      [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
-        return val_a * val_b;
-      },
+      [](const auto val_a, const auto val_b) { return val_a * val_b; },
       ctx,
       a,
       utils::SupportedTensorDtypes::REALHBBF16,
10 changes: 6 additions & 4 deletions kernels/portable/cpu/op_native_dropout.cpp
@@ -57,8 +57,11 @@ std::tuple<Tensor&, Tensor&> native_dropout_out(
   }
   ET_SWITCH_FLOATHBF16_TYPES(
       input.scalar_type(), ctx, op_name, CTYPE_COMPUTE, [&]() {
-        utils::apply_bitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
-            [](const auto val, const auto mask_val) {
+        utils::apply_bitensor_elementwise_fn<
+            CTYPE_COMPUTE,
+            op_name,
+            utils::SupportedTensorDtypes::SAME_AS_COMMON>(
+            [](const CTYPE_COMPUTE val, const CTYPE_COMPUTE mask_val) {
               if (!mask_val) {
                 return static_cast<decltype(val)>(0);
               }
@@ -70,8 +73,7 @@ std::tuple<Tensor&, Tensor&> native_dropout_out(
             mask,
             // TODO: should really be just BOOL
             utils::SupportedTensorDtypes::BOOL_OR_BYTE,
-            out,
-            utils::SupportedTensorDtypes::SAME_AS_COMMON);
+            out);
       });
   } else if (input.numel() > 0) {
     std::memcpy(out.mutable_data_ptr(), input.data_ptr(), input.nbytes());
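
This hunk moves in the opposite direction from the rest of the PR: the lambda is pinned to CTYPE_COMPUTE rather than auto, because its early return branches on mask_val, and a branch needs one bool per element, something a Vectorized pack cannot supply. A vectorizable rewrite would express the branch as a lane-wise blend, sketched below; this is my illustration of the distinction, not code from the PR:

#include <ATen/cpu/vec/vec.h>

// Scalar form (simplified from the kernel's lambda): branch per element.
inline float dropout_scalar(float val, float mask_val) {
  if (!mask_val) {
    return 0.0f;
  }
  return val;
}

// Hypothetical vector form: keep lanes of val where the mask lane is
// nonzero, zero elsewhere, with no branch.
inline at::vec::Vectorized<float> dropout_vec(
    at::vec::Vectorized<float> val,
    at::vec::Vectorized<float> mask_val) {
  const auto zero = at::vec::Vectorized<float>(0.0f);
  return at::vec::Vectorized<float>::blendv(zero, val, mask_val != zero);
}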
23 changes: 16 additions & 7 deletions kernels/portable/cpu/op_pow.cpp
@@ -57,9 +57,8 @@ Tensor& pow_Tensor_Tensor_out(
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::REALHBF16>(
-      [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
-        // TODO: rewrite this to be vectorization-capable.
-        return std::pow(val_a, val_b);
+      [](const auto val_a, const auto val_b) {
+        return executorch::math::pow(val_a, val_b);
       },
       ctx,
       a,
@@ -111,8 +110,13 @@ Tensor& pow_Tensor_Scalar_out(
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::REALHBF16>(
-      // TODO: rewrite this to be vectorization-capable.
-      [val_b](const CTYPE_COMPUTE val_a) { return std::pow(val_a, val_b); },
+      // Casting val_b here supports vectorization; it does
+      // nothing if we are not vectorizing (casts to
+      // CTYPE_COMPUTE) and casts to a vectorized type
+      // otherwise.
+      [val_b](const auto val_a) {
+        return executorch::math::pow(val_a, decltype(val_a)(val_b));
+      },
       ctx,
       a,
       utils::SupportedTensorDtypes::REALHBBF16,
@@ -161,8 +165,13 @@ Tensor& pow_Scalar_out(
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::REALHBF16>(
-      // TODO: rewrite this to be vectorization-capable.
-      [val_a](const CTYPE_COMPUTE val_b) { return std::pow(val_a, val_b); },
+      // Casting val_a here supports vectorization; it does
+      // nothing if we are not vectorizing (casts to
+      // CTYPE_COMPUTE) and casts to a vectorized type
+      // otherwise.
+      [val_a](const auto val_b) {
+        return executorch::math::pow(decltype(val_b)(val_a), val_b);
+      },
       ctx,
       b,
       utils::SupportedTensorDtypes::REALHBBF16,
7 changes: 3 additions & 4 deletions kernels/portable/cpu/op_sigmoid.cpp
@@ -49,10 +49,9 @@ Tensor& sigmoid_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::FLOATHBF16>(
-      [](const auto val_in) -> CTYPE_COMPUTE {
-        // TODO: rewrite this to be vectorization-capable
-        CTYPE_COMPUTE out_val = static_cast<CTYPE_COMPUTE>(1.0) /
-            (static_cast<CTYPE_COMPUTE>(1.0) + exp(-val_in));
+      [](const auto val_in) {
+        const auto one = static_cast<decltype(val_in)>(1.0);
+        auto out_val = one / (one + executorch::math::exp(-val_in));
         return out_val;
       },
       ctx,
7 changes: 4 additions & 3 deletions kernels/portable/cpu/op_sub.cpp
@@ -61,7 +61,7 @@ Tensor& sub_out(
       op_name,
       utils::SupportedTensorDtypes::REALHBF16>(
       [val_alpha](const auto val_a, const auto val_b) {
-        return val_a - val_alpha * val_b;
+        return val_a - (decltype(val_b))(val_alpha)*val_b;
       },
       ctx,
       a,
@@ -112,12 +112,13 @@ Tensor& sub_scalar_out(
   ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
     const CTYPE_COMPUTE val_b = utils::scalar_to<CTYPE_COMPUTE>(b);
     const CTYPE_COMPUTE val_alpha = utils::scalar_to<CTYPE_COMPUTE>(alpha);
+    const auto val_alpha_times_b = val_alpha * val_b;
     utils::apply_unitensor_elementwise_fn<
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-        [val_b, val_alpha](const auto val_a) {
-          return val_a - val_alpha * val_b;
+        [val_alpha_times_b](const auto val_a) {
+          return val_a - (decltype(val_a))(val_alpha_times_b);
         },
         ctx,
         a,
6 changes: 3 additions & 3 deletions kernels/portable/cpu/op_where.cpp
@@ -47,9 +47,9 @@ Tensor& where_out(
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-      [](const auto val_a, const auto val_b, const auto val_c) {
-        return val_c ? val_a : val_b;
-      },
+      [](const CTYPE_COMPUTE val_a,
+         const CTYPE_COMPUTE val_b,
+         const CTYPE_COMPUTE val_c) { return val_c ? val_a : val_b; },
       ctx,
       a,
       utils::SupportedTensorDtypes::REALHBBF16,