diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute.glsl b/backends/vulkan/runtime/graph/ops/glsl/permute.glsl deleted file mode 100644 index 716c42e8ede..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/permute.glsl +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} - -layout(push_constant) uniform PRECISION restrict Block { - ivec4 out_limits; - ivec4 in_sizes; - // output dims - ivec4 out_ndims; - // x = output channels aligned to 4, y = input channels aligned to 4 - ivec2 channel_info; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -layout(constant_id = 3) const int packed_dim = C_DIM; - -#extension GL_EXT_control_flow_attributes : require - -void main() { - ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits.xyz))) { - return; - } - - VEC4_T outval = VEC4_T(0.0); - - // scale up output position's packed dim - pos[packed_dim] <<= 2; - - // index of packed dim in bchw format - const int in_packed_dim_bchw_index = 3 - packed_dim; - - // determine input position based on output position and permute map - // out_ndims is in BCHW format - ivec4 in_bchw_pos = ivec4(0); // holds b,c,h,w - in_bchw_pos[out_ndims[0]] = (pos.z / channel_info.x); - in_bchw_pos[out_ndims[1]] = (pos.z % channel_info.x); - in_bchw_pos[out_ndims[2]] = pos.y; - in_bchw_pos[out_ndims[3]] = pos.x; - - const int in_packed_dim_size = in_sizes[3 - out_ndims[in_packed_dim_bchw_index]]; - - [[unroll]] for (int j = 0, bchw_index = in_bchw_pos[out_ndims[in_packed_dim_bchw_index]]; j < 4; ++j, ++bchw_index) { - // terminate the loop if trying to access input texture out of bounds - if (bchw_index >= in_packed_dim_size) { - break; - } - // go to position in the input, that is mapped to the packed dim in the output - in_bchw_pos[out_ndims[in_packed_dim_bchw_index]] = bchw_index; - - ivec3 fetch_pos; - - fetch_pos.xy = in_bchw_pos.wz; - // calculate input position in z axis using batch and channel index which is in_bchw_pos.x and in_bchw_pos.y respectively - fetch_pos.z = in_bchw_pos.y + in_bchw_pos.x * channel_info.y; - - // input tensor's packed dim lane corresponding to output tensor's pos - const int in_packed_dim_lane_index = fetch_pos[packed_dim] & 0x3; - - // scale down input tensor's packed dim pos to perform fetch - fetch_pos[packed_dim] >>= 2; - - // fetch input texel - VEC4_T inval = VEC4_T(load_texel(t_in, fetch_pos)); - outval[j] = inval[in_packed_dim_lane_index]; - } - - pos[packed_dim] = int(gl_GlobalInvocationID[packed_dim]); - - imageStore(t_out, pos, outval); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/permute_buffer.glsl new file mode 100644 index 00000000000..55b9e3dc9ea --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/permute_buffer.glsl @@ -0,0 +1,72 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} +#define T ${buffer_scalar_type(DTYPE)} + +${define_active_storage_type("buffer")} +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +#include "indexing_utils.h" + +${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")} +${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer")} + +${layout_declare_ubo(B, "ivec4", "in_sizes")} +${layout_declare_ubo(B, "ivec4", "out_strides")} +${layout_declare_ubo(B, "int", "out_numel")} + +layout(push_constant) uniform restrict Block { + ivec4 in_strides; + ivec4 permute_dims; // Permutation mapping: permute_dims[i] = j means output dim i comes from input dim j +}; + +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} +${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} + +const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +// Convert output tensor index to input tensor index based on permutation +ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) { + ivec4 in_tidx; + + // Apply the permutation mapping: in_tidx[permute_dims[i]] = out_tidx[i] + in_tidx[permute_dims.x] = out_tidx.x; + in_tidx[permute_dims.y] = out_tidx.y; + in_tidx[permute_dims.z] = out_tidx.z; + in_tidx[permute_dims.w] = out_tidx.w; + + return in_tidx; +} + +void main() { + const int out_bufi = ivec3(gl_GlobalInvocationID).x; + if (out_bufi >= out_numel) { + return; + } + + // Convert buffer index to tensor index for output + const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); + + // Convert output tensor index to input tensor index using permutation + const ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); + + // Convert input tensor index back to buffer index + const int in_bufi = tidx_to_bufi(in_tidx, in_strides); + + // Copy data from input to output + t_out[out_bufi] = t_in[in_bufi]; +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute.yaml b/backends/vulkan/runtime/graph/ops/glsl/permute_buffer.yaml similarity index 73% rename from backends/vulkan/runtime/graph/ops/glsl/permute.yaml rename to backends/vulkan/runtime/graph/ops/glsl/permute_buffer.yaml index a90ddcb41ce..81675ae8917 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/permute.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/permute_buffer.yaml @@ -1,12 +1,10 @@ -permute: +permute_buffer: parameter_names_with_default_values: DTYPE: float - NDIM: 3 - STORAGE: texture3d generate_variant_forall: DTYPE: - VALUE: half - VALUE: float - VALUE: int32 shader_variants: - - NAME: permute + - NAME: permute_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl new file mode 100644 index 00000000000..274077f4181 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl @@ -0,0 +1,103 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} +#define T ${buffer_scalar_type(DTYPE)} + +${define_active_storage_type("texture3d")} +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +#include "indexing_utils.h" + +${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")} +${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")} + +layout(push_constant) uniform restrict Block { + ivec4 out_sizes; + ivec4 in_sizes; + ivec4 permute_dims; // Permutation mapping: permute_dims[i] = j means output dim i comes from input dim j +}; + +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); +const lowp int out_packed_dim = unhash_packed_dim(out_layout); + +${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); +const lowp int in_packed_dim = unhash_packed_dim(in_layout); + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +// Convert output tensor index to input tensor index based on permutation +ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) { + ivec4 in_tidx; + + // Apply the permutation mapping: in_tidx[permute_dims[i]] = out_tidx[i] + in_tidx[permute_dims.x] = out_tidx.x; + in_tidx[permute_dims.y] = out_tidx.y; + in_tidx[permute_dims.z] = out_tidx.z; + in_tidx[permute_dims.w] = out_tidx.w; + + return in_tidx; +} + +// Check if we can use the fast path where texels from the input tensor can be +// copied directly into the output tensor. This occurs when the packed dimension +// is preserved in the permutation, i.e. reading a texel from the output tensor +// produces 4 texels along the same dimension as reading a texel from the input +// tensor. +bool can_use_fast_path() { + // Fast path is possible when the packed dimension is preserved in the permutation + // This means permute_dims[out_packed_dim] == in_packed_dim + return permute_dims[out_packed_dim] == in_packed_dim; +} + +void main() { + const ivec3 lpos = ivec3(gl_GlobalInvocationID); + ivec4 out_tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, out_packed_dim); + + if (any(greaterThanEqual(out_tidx, out_sizes))) { + return; + } + + if (can_use_fast_path()) { + // Fast path: packed dimension is preserved, so we can copy texels directly + ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); + ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim); + VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos)); + + write_texel_lpos(t_out, lpos, in_texel, out_axis_map); + } + else { + // Slow path: packed dimension is not preserved, so each element of the + // output texel may be "sourced" from a different texel in the input tensor. + // Therefore each output texel element is processed individually. + VEC4_T out_texel = VEC4_T(0); + + for (int texel_i = 0; texel_i < 4; ++texel_i) { + ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); + ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim); + int element_idx = in_tidx[in_packed_dim] % 4; + + VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos)); + T selected_value = T(in_texel[element_idx]); + + out_texel[texel_i] = selected_value; + + out_tidx[out_packed_dim]++; + } + + write_texel_lpos(t_out, lpos, out_texel, out_axis_map); + } +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/permute_texture.yaml new file mode 100644 index 00000000000..f68b8dcdd3d --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/permute_texture.yaml @@ -0,0 +1,10 @@ +permute_texture: + parameter_names_with_default_values: + DTYPE: float + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + - VALUE: int32 + shader_variants: + - NAME: permute_texture3d diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp index fba3f03467b..6e6a6fa3bf2 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp @@ -10,6 +10,7 @@ #include +#include #include #include #include @@ -100,54 +101,76 @@ void add_permute_node( const ValueRef out) { check_args(graph, in, permute_dims, out); - ivec4 out_dims{0, 1, 2, 3}; - - // Special cases of squeeze/unsqueeze. Because the input dim size can be - // different with output dim size. So pick graph.dim_of(in) if squeeze, and - // graph.dim_of(out) if unsqueeze to create parameter for permute. - const int64_t out_ndim = std::max(graph.dim_of(in), graph.dim_of(out)); - std::vector seen(out_ndim); + // Convert the permute dims to WHCN dimension order, which is the standard in + // our compute shaders. The following transformations are applied. + // 1. Change dimension index values from NCHW order valueto WHCN order value + // 2. Reverse the order of the permute array from NCHW order to WHCN order + ivec4 whcn_permute_dims{0, 1, 2, 3}; { IntListPtr permute_dims_ptr = graph.get_int_list(permute_dims); - for (int i = 0; i < out_ndim; i++) { - int64_t permute_dim = permute_dims_ptr->at(i); - VK_CHECK_COND( - !seen[permute_dim], "Argument dim ", permute_dim, " is repeated"); - seen[permute_dim] = true; + const int32_t permute_ndim = + utils::safe_downcast(permute_dims_ptr->size()); + + for (int32_t nchw_i = permute_ndim - 1, whcn_i = 0; nchw_i >= 0; + nchw_i--, whcn_i++) { + const int32_t permute_dim_nchw = permute_dims_ptr->at(nchw_i); + const int32_t permute_dim_whcn = permute_ndim - 1 - permute_dim_nchw; - out_dims[(4u - out_ndim) + i] = - utils::safe_downcast(permute_dim + (4 - out_ndim)); + whcn_permute_dims[whcn_i] = permute_dim_whcn; } } std::string kernel_name = "permute"; kernel_name.reserve(kShaderNameReserve); + add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); add_dtype_suffix(kernel_name, graph.dtype_of(out)); - const int32_t out_channels = dim_at(graph.sizes_of(out)); - const int32_t in_channels = dim_at(graph.sizes_of(in)); + vkapi::ParamsBindList param_buffers; + std::vector push_constants; + vkapi::SpecVarList spec_vars; - const int32_t packed_dim = graph.packed_dim_of(in); - ivec2 channel_info = {out_channels, in_channels}; - if (packed_dim == WHCN::kChannelsDim) { - channel_info[0] = utils::align_up_4(channel_info[0]); - channel_info[1] = utils::align_up_4(channel_info[1]); - } + if (graph.is_buffer_storage(out)) { + param_buffers.append(graph.sizes_ubo(in)); + param_buffers.append(graph.strides_ubo(out)); + param_buffers.append(graph.numel_ubo(out)); + + // Buffer storage - use permute_buffer shader + push_constants = { + graph.strides_pc_of(in), + PushConstantDataInfo(&whcn_permute_dims, sizeof(whcn_permute_dims)), + }; + + spec_vars = {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}; + } else { + // Texture storage - use permute_texture shader + const int32_t out_channels = dim_at(graph.sizes_of(out)); + const int32_t in_channels = dim_at(graph.sizes_of(in)); + + const int32_t packed_dim = graph.packed_dim_of(in); + ivec2 channel_info = {out_channels, in_channels}; + if (packed_dim == WHCN::kChannelsDim) { + channel_info[0] = utils::align_up_4(channel_info[0]); + channel_info[1] = utils::align_up_4(channel_info[1]); + } + + push_constants = { + graph.sizes_pc_of(out), + graph.sizes_pc_of(in), + PushConstantDataInfo(&whcn_permute_dims, sizeof(whcn_permute_dims))}; - const vkapi::SpecVarList spec_vars = {packed_dim}; + spec_vars = {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}; + } - graph.execute_nodes().emplace_back(new DispatchNode( + graph.execute_nodes().emplace_back(new DynamicDispatchNode( graph, VK_KERNEL_FROM_STR(kernel_name), - graph.create_global_wg_size(out), - graph.create_local_wg_size(out), + default_pick_global_wg_size, + default_pick_local_wg_size, {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - {}, + // Parameter buffers + param_buffers, // Push Constants - {{graph.logical_limits_pc_of(out), - graph.sizes_pc_of(in), - PushConstantDataInfo(&out_dims, sizeof(out_dims)), - PushConstantDataInfo(&channel_info, sizeof(channel_info))}}, + push_constants, // Specialization Constants spec_vars, // Resize Args diff --git a/backends/vulkan/runtime/graph/ops/impl/Unsqueeze.cpp b/backends/vulkan/runtime/graph/ops/impl/Unsqueeze.cpp index 306a79fb8b8..c4de5d88f30 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Unsqueeze.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Unsqueeze.cpp @@ -26,6 +26,9 @@ void add_unsqueeze_node( in_dim < 4, "Cannot unsqueeze a tensor with more than 3 dimensions"); int64_t dim = graph.extract_scalar(dim_ref); + if (dim < 0) { + dim += out_dim; + } std::vector permute_dims(out_dim); for (int i = 1; i <= dim; i++) { diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index 813807445f0..92f73268ebf 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -752,6 +752,13 @@ def get_permute_inputs(): "utils::kHeightPacked", "utils::kChannelsPacked", ] + test_suite.storage_types = [ + "utils::kBuffer", + "utils::kTexture3D", + ] + test_suite.dtypes = [ + "at::kFloat", + ] return test_suite @@ -990,9 +997,11 @@ def get_unsqueeze_inputs(): ((9, 9), 2), ((9,), 0), ((9,), 1), + ((1, 10), -1), ] ) test_suite.layouts = [ + "utils::kWidthPacked", "utils::kChannelsPacked", ] test_suite.data_gen = "make_seq_tensor"