diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute.glsl b/backends/vulkan/runtime/graph/ops/glsl/permute.glsl
deleted file mode 100644
index 716c42e8ede..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/permute.glsl
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-#define VEC4_T ${texel_type(DTYPE)}
-
-layout(std430) buffer;
-
-#include "indexing_utils.h"
-
-${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
-${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
-
-layout(push_constant) uniform PRECISION restrict Block {
-  ivec4 out_limits;
-  ivec4 in_sizes;
-  // output dims
-  ivec4 out_ndims;
-  // x = output channels aligned to 4, y = input channels aligned to 4
-  ivec2 channel_info;
-};
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
-#extension GL_EXT_control_flow_attributes : require
-
-void main() {
-  ivec3 pos = ivec3(gl_GlobalInvocationID);
-
-  if (any(greaterThanEqual(pos, out_limits.xyz))) {
-    return;
-  }
-
-  VEC4_T outval = VEC4_T(0.0);
-
-  // scale up output position's packed dim
-  pos[packed_dim] <<= 2;
-
-  // index of packed dim in bchw format
-  const int in_packed_dim_bchw_index = 3 - packed_dim;
-
-  // determine input position based on output position and permute map
-  // out_ndims is in BCHW format
-  ivec4 in_bchw_pos = ivec4(0); // holds b,c,h,w
-  in_bchw_pos[out_ndims[0]] = (pos.z / channel_info.x);
-  in_bchw_pos[out_ndims[1]] = (pos.z % channel_info.x);
-  in_bchw_pos[out_ndims[2]] = pos.y;
-  in_bchw_pos[out_ndims[3]] = pos.x;
-
-  const int in_packed_dim_size = in_sizes[3 - out_ndims[in_packed_dim_bchw_index]];
-
-  [[unroll]] for (int j = 0, bchw_index = in_bchw_pos[out_ndims[in_packed_dim_bchw_index]]; j < 4; ++j, ++bchw_index) {
-    // terminate the loop if trying to access input texture out of bounds
-    if (bchw_index >= in_packed_dim_size) {
-      break;
-    }
-    // go to position in the input, that is mapped to the packed dim in the output
-    in_bchw_pos[out_ndims[in_packed_dim_bchw_index]] = bchw_index;
-
-    ivec3 fetch_pos;
-
-    fetch_pos.xy = in_bchw_pos.wz;
-    // calculate input position in z axis using batch and channel index which is in_bchw_pos.x and in_bchw_pos.y respectively
-    fetch_pos.z = in_bchw_pos.y + in_bchw_pos.x * channel_info.y;
-
-    // input tensor's packed dim lane corresponding to output tensor's pos
-    const int in_packed_dim_lane_index = fetch_pos[packed_dim] & 0x3;
-
-    // scale down input tensor's packed dim pos to perform fetch
-    fetch_pos[packed_dim] >>= 2;
-
-    // fetch input texel
-    VEC4_T inval = VEC4_T(load_texel(t_in, fetch_pos));
-    outval[j] = inval[in_packed_dim_lane_index];
-  }
-
-  pos[packed_dim] = int(gl_GlobalInvocationID[packed_dim]);
-
-  imageStore(t_out, pos, outval);
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/permute_buffer.glsl
new file mode 100644
index 00000000000..55b9e3dc9ea
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/permute_buffer.glsl
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_type(DTYPE)}
+#define T ${buffer_scalar_type(DTYPE)}
+
+${define_active_storage_type("buffer")}
+${define_required_extensions(DTYPE)}
+
+layout(std430) buffer;
+
+#include "indexing_utils.h"
+
+${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")}
+${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer")}
+
+${layout_declare_ubo(B, "ivec4", "in_sizes")}
+${layout_declare_ubo(B, "ivec4", "out_strides")}
+${layout_declare_ubo(B, "int", "out_numel")}
+
+layout(push_constant) uniform restrict Block {
+  ivec4 in_strides;
+  ivec4 permute_dims; // Permutation mapping: permute_dims[i] = j means output dim i comes from input dim j
+};
+
+${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
+${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
+
+const lowp ivec4 out_dim_order = unhash_dim_order(out_layout);
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+// Convert output tensor index to input tensor index based on permutation
+ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) {
+  ivec4 in_tidx;
+
+  // Apply the permutation mapping: in_tidx[permute_dims[i]] = out_tidx[i]
+  in_tidx[permute_dims.x] = out_tidx.x;
+  in_tidx[permute_dims.y] = out_tidx.y;
+  in_tidx[permute_dims.z] = out_tidx.z;
+  in_tidx[permute_dims.w] = out_tidx.w;
+
+  return in_tidx;
+}
+
+void main() {
+  const int out_bufi = ivec3(gl_GlobalInvocationID).x;
+  if (out_bufi >= out_numel) {
+    return;
+  }
+
+  // Convert buffer index to tensor index for output
+  const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order);
+
+  // Convert output tensor index to input tensor index using permutation
+  const ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx);
+
+  // Convert input tensor index back to buffer index
+  const int in_bufi = tidx_to_bufi(in_tidx, in_strides);
+
+  // Copy data from input to output
+  t_out[out_bufi] = t_in[in_bufi];
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute.yaml b/backends/vulkan/runtime/graph/ops/glsl/permute_buffer.yaml
similarity index 73%
rename from backends/vulkan/runtime/graph/ops/glsl/permute.yaml
rename to backends/vulkan/runtime/graph/ops/glsl/permute_buffer.yaml
index a90ddcb41ce..81675ae8917 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/permute.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/permute_buffer.yaml
@@ -1,12 +1,10 @@
-permute:
+permute_buffer:
   parameter_names_with_default_values:
     DTYPE: float
-    NDIM: 3
-    STORAGE: texture3d
   generate_variant_forall:
     DTYPE:
       - VALUE: half
       - VALUE: float
       - VALUE: int32
   shader_variants:
-    - NAME: permute
+    - NAME: permute_buffer
diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl
new file mode 100644
index 00000000000..274077f4181
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_type(DTYPE)}
+#define T ${buffer_scalar_type(DTYPE)}
+
+${define_active_storage_type("texture3d")}
+${define_required_extensions(DTYPE)}
+
+layout(std430) buffer;
+
+#include "indexing_utils.h"
+
+${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")}
+${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")}
+
+layout(push_constant) uniform restrict Block {
+  ivec4 out_sizes;
+  ivec4 in_sizes;
+  ivec4 permute_dims; // Permutation mapping: permute_dims[i] = j means output dim i comes from input dim j
+};
+
+${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
+const lowp ivec4 out_axis_map = unhash_axis_map(out_layout);
+const lowp int out_packed_dim = unhash_packed_dim(out_layout);
+
+${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
+const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);
+const lowp int in_packed_dim = unhash_packed_dim(in_layout);
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+// Convert output tensor index to input tensor index based on permutation
+ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) {
+  ivec4 in_tidx;
+
+  // Apply the permutation mapping: in_tidx[permute_dims[i]] = out_tidx[i]
+  in_tidx[permute_dims.x] = out_tidx.x;
+  in_tidx[permute_dims.y] = out_tidx.y;
+  in_tidx[permute_dims.z] = out_tidx.z;
+  in_tidx[permute_dims.w] = out_tidx.w;
+
+  return in_tidx;
+}
+
+// Check if we can use the fast path where texels from the input tensor can be
+// copied directly into the output tensor. This occurs when the packed dimension
+// is preserved in the permutation, i.e. reading a texel from the output tensor
+// produces 4 texels along the same dimension as reading a texel from the input
+// tensor.
+bool can_use_fast_path() {
+  // Fast path is possible when the packed dimension is preserved in the permutation
+  // This means permute_dims[out_packed_dim] == in_packed_dim
+  return permute_dims[out_packed_dim] == in_packed_dim;
+}
+
+void main() {
+  const ivec3 lpos = ivec3(gl_GlobalInvocationID);
+  ivec4 out_tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, out_packed_dim);
+
+  if (any(greaterThanEqual(out_tidx, out_sizes))) {
+    return;
+  }
+
+  if (can_use_fast_path()) {
+    // Fast path: packed dimension is preserved, so we can copy texels directly
+    ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx);
+    ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim);
+    VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos));
+
+    write_texel_lpos(t_out, lpos, in_texel, out_axis_map);
+  }
+  else {
+    // Slow path: packed dimension is not preserved, so each element of the
+    // output texel may be "sourced" from a different texel in the input tensor.
+    // Therefore each output texel element is processed individually.
+    VEC4_T out_texel = VEC4_T(0);
+
+    for (int texel_i = 0; texel_i < 4; ++texel_i) {
+      ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx);
+      ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim);
+      int element_idx = in_tidx[in_packed_dim] % 4;
+
+      VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos));
+      T selected_value = T(in_texel[element_idx]);
+
+      out_texel[texel_i] = selected_value;
+
+      out_tidx[out_packed_dim]++;
+    }
+
+    write_texel_lpos(t_out, lpos, out_texel, out_axis_map);
+  }
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/permute_texture.yaml
new file mode 100644
index 00000000000..f68b8dcdd3d
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/permute_texture.yaml
@@ -0,0 +1,10 @@
+permute_texture:
+  parameter_names_with_default_values:
+    DTYPE: float
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+      - VALUE: int32
+  shader_variants:
+    - NAME: permute_texture3d
diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp
index fba3f03467b..6e6a6fa3bf2 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp
@@ -10,6 +10,7 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
 
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
@@ -100,54 +101,76 @@ void add_permute_node(
     const ValueRef out) {
   check_args(graph, in, permute_dims, out);
 
-  ivec4 out_dims{0, 1, 2, 3};
-
-  // Special cases of squeeze/unsqueeze. Because the input dim size can be
-  // different with output dim size. So pick graph.dim_of(in) if squeeze, and
-  // graph.dim_of(out) if unsqueeze to create parameter for permute.
-  const int64_t out_ndim = std::max(graph.dim_of(in), graph.dim_of(out));
-  std::vector<bool> seen(out_ndim);
+  // Convert the permute dims to WHCN dimension order, which is the standard in
+  // our compute shaders. The following transformations are applied.
+  // 1. Change dimension index values from NCHW order valueto WHCN order value
+  // 2. Reverse the order of the permute array from NCHW order to WHCN order
+  ivec4 whcn_permute_dims{0, 1, 2, 3};
   {
     IntListPtr permute_dims_ptr = graph.get_int_list(permute_dims);
-    for (int i = 0; i < out_ndim; i++) {
-      int64_t permute_dim = permute_dims_ptr->at(i);
-      VK_CHECK_COND(
-          !seen[permute_dim], "Argument dim ", permute_dim, "  is repeated");
-      seen[permute_dim] = true;
+    const int32_t permute_ndim =
+        utils::safe_downcast<int>(permute_dims_ptr->size());
+
+    for (int32_t nchw_i = permute_ndim - 1, whcn_i = 0; nchw_i >= 0;
+         nchw_i--, whcn_i++) {
+      const int32_t permute_dim_nchw = permute_dims_ptr->at(nchw_i);
+      const int32_t permute_dim_whcn = permute_ndim - 1 - permute_dim_nchw;
 
-      out_dims[(4u - out_ndim) + i] =
-          utils::safe_downcast<int32_t>(permute_dim + (4 - out_ndim));
+      whcn_permute_dims[whcn_i] = permute_dim_whcn;
     }
   }
 
   std::string kernel_name = "permute";
   kernel_name.reserve(kShaderNameReserve);
+  add_storage_type_suffix(kernel_name, graph.storage_type_of(out));
   add_dtype_suffix(kernel_name, graph.dtype_of(out));
 
-  const int32_t out_channels = dim_at<kChannel4D>(graph.sizes_of(out));
-  const int32_t in_channels = dim_at<kChannel4D>(graph.sizes_of(in));
+  vkapi::ParamsBindList param_buffers;
+  std::vector<PushConstantDataInfo> push_constants;
+  vkapi::SpecVarList spec_vars;
 
-  const int32_t packed_dim = graph.packed_dim_of(in);
-  ivec2 channel_info = {out_channels, in_channels};
-  if (packed_dim == WHCN::kChannelsDim) {
-    channel_info[0] = utils::align_up_4(channel_info[0]);
-    channel_info[1] = utils::align_up_4(channel_info[1]);
-  }
+  if (graph.is_buffer_storage(out)) {
+    param_buffers.append(graph.sizes_ubo(in));
+    param_buffers.append(graph.strides_ubo(out));
+    param_buffers.append(graph.numel_ubo(out));
+
+    // Buffer storage - use permute_buffer shader
+    push_constants = {
+        graph.strides_pc_of(in),
+        PushConstantDataInfo(&whcn_permute_dims, sizeof(whcn_permute_dims)),
+    };
+
+    spec_vars = {graph.hashed_layout_of(out), graph.hashed_layout_of(in)};
+  } else {
+    // Texture storage - use permute_texture shader
+    const int32_t out_channels = dim_at<kChannel4D>(graph.sizes_of(out));
+    const int32_t in_channels = dim_at<kChannel4D>(graph.sizes_of(in));
+
+    const int32_t packed_dim = graph.packed_dim_of(in);
+    ivec2 channel_info = {out_channels, in_channels};
+    if (packed_dim == WHCN::kChannelsDim) {
+      channel_info[0] = utils::align_up_4(channel_info[0]);
+      channel_info[1] = utils::align_up_4(channel_info[1]);
+    }
+
+    push_constants = {
+        graph.sizes_pc_of(out),
+        graph.sizes_pc_of(in),
+        PushConstantDataInfo(&whcn_permute_dims, sizeof(whcn_permute_dims))};
 
-  const vkapi::SpecVarList spec_vars = {packed_dim};
+    spec_vars = {graph.hashed_layout_of(out), graph.hashed_layout_of(in)};
+  }
 
-  graph.execute_nodes().emplace_back(new DispatchNode(
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      graph.create_global_wg_size(out),
-      graph.create_local_wg_size(out),
+      default_pick_global_wg_size,
+      default_pick_local_wg_size,
       {{out, vkapi::kWrite}, {in, vkapi::kRead}},
-      {},
+      // Parameter buffers
+      param_buffers,
       // Push Constants
-      {{graph.logical_limits_pc_of(out),
-        graph.sizes_pc_of(in),
-        PushConstantDataInfo(&out_dims, sizeof(out_dims)),
-        PushConstantDataInfo(&channel_info, sizeof(channel_info))}},
+      push_constants,
       // Specialization Constants
       spec_vars,
       // Resize Args
diff --git a/backends/vulkan/runtime/graph/ops/impl/Unsqueeze.cpp b/backends/vulkan/runtime/graph/ops/impl/Unsqueeze.cpp
index 306a79fb8b8..c4de5d88f30 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Unsqueeze.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Unsqueeze.cpp
@@ -26,6 +26,9 @@ void add_unsqueeze_node(
       in_dim < 4, "Cannot unsqueeze a tensor with more than 3 dimensions");
 
   int64_t dim = graph.extract_scalar<int64_t>(dim_ref);
+  if (dim < 0) {
+    dim += out_dim;
+  }
 
   std::vector<int64_t> permute_dims(out_dim);
   for (int i = 1; i <= dim; i++) {
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
index 813807445f0..92f73268ebf 100644
--- a/backends/vulkan/test/op_tests/cases.py
+++ b/backends/vulkan/test/op_tests/cases.py
@@ -752,6 +752,13 @@ def get_permute_inputs():
         "utils::kHeightPacked",
         "utils::kChannelsPacked",
     ]
+    test_suite.storage_types = [
+        "utils::kBuffer",
+        "utils::kTexture3D",
+    ]
+    test_suite.dtypes = [
+        "at::kFloat",
+    ]
     return test_suite
 
 
@@ -990,9 +997,11 @@ def get_unsqueeze_inputs():
             ((9, 9), 2),
             ((9,), 0),
             ((9,), 1),
+            ((1, 10), -1),
         ]
     )
     test_suite.layouts = [
+        "utils::kWidthPacked",
         "utils::kChannelsPacked",
     ]
     test_suite.data_gen = "make_seq_tensor"