[ET-VK][ez] Add support for buffer backed qparams in int4 linear + add checks for physical limits when allocating (#10233)

pytorchbot · web-flow · commit 5999f101dc56 · 2025-04-16T14:54:47.000-04:00
## Context

Currently, the groupwise quantized int4 linear op implementation forces the scales and zeros tensor to be a `Texture3D`. However, for some models, e.g. transformer models that have a logit linear layer, the image extents required may exceed the maximum image extents available on the device.

## Changes

* Add support for the scales and zeros tensor being a `Buffer` instead of a `Texture3D`
* Add checks, when allocating buffers or images for tensors, that the requested resource fits within the physical device limits

Differential Revision: [D72662176](https://our.internmc.facebook.com/intern/diff/D72662176/)
diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp
@@ -260,6 +260,26 @@ vkapi::VulkanImage allocate_image(
       return vkapi::VulkanImage();
   }
 
+    // TODO(ssjia): change to always check that the image extents do not exceed
+    // physical limits. Adding the check now based on `maxImageDimension3D` will
+    // cause some existing models to break. Anecdotally, on Adreno and
+    // SwiftShader devices, using 3D textures that exceed `maxImageDimension3D`
+    // appears to be ok. So we need to figure out if it is undefined behaviour
+    // or if there's a better way to figure out what the limit is. For now, only
+    // check during debug build so that we can detect when exceeding physical
+    // limits could be a potential cause for model outputs to be wrong. In the
+    // meantime, the threshold for using texture storage can be configured at
+    // export time.
+#ifdef VULKAN_DEBUG
+  uint32_t max_extent = storage_type == utils::kTexture3D
+      ? adapter_ptr->max_texture3d_dim()
+      : adapter_ptr->max_texture2d_dim();
+
+  VK_CHECK_COND(
+      image_extents[0] <= max_extent && image_extents[1] <= max_extent &&
+      image_extents[2] <= max_extent);
+#endif
+
   VkSampler sampler = adapter_ptr->sampler_cache().retrieve(sampler_props);
 
   return adapter_ptr->vma().create_image(
@@ -291,6 +311,8 @@ vkapi::VulkanBuffer allocate_buffer(
       return vkapi::VulkanBuffer();
   }
 
+  VK_CHECK_COND(numel <= context_ptr->adapter_ptr()->max_buffer_numel());
+
   return adapter_ptr->vma().create_storage_buffer(
       element_size(dtype) * numel, allocate_memory);
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.glsl b/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.glsl
@@ -109,8 +109,8 @@ void main() {
       in_vals[r][0] = get_first(in_val_packed);
       in_vals[r][1] = get_second(in_val_packed);
     } else {
-      in_vals[r][0] = uint8_t(254);
-      in_vals[r][1] = uint8_t(254);
+      in_vals[r][0] = uint8_t(0);
+      in_vals[r][1] = uint8_t(0);
     }
   }
 
@@ -131,6 +131,6 @@ void main() {
     t_qmat2[packed_pos.y * stride + packed_pos.x] = out_tex_1;
     t_qmat2[(packed_pos.y + 1) * stride + packed_pos.x] = out_tex_2;
   $else:
-    imageStore(t_qmat2, ivec3(packed_pos.xy, 0), out_tex_1);
-    imageStore(t_qmat2, ivec3(packed_pos.x, packed_pos.y + 1, 0), out_tex_2);
+    imageStore(t_qmat2, packed_pos.xy, out_tex_1);
+    imageStore(t_qmat2, ivec2(packed_pos.x, packed_pos.y + 1), out_tex_2);
 }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.yaml b/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.yaml
@@ -6,8 +6,10 @@
 
 pack_int4_linear_weight_transposed_interleaved:
   parameter_names_with_default_values:
-    STORAGE: texture3d
+    STORAGE: texture2d
+  generate_variant_forall:
+    STORAGE:
+      - VALUE: texture2d
+      - VALUE: buffer
   shader_variants:
-    - NAME: pack_int4_linear_weight_transposed_interleaved_texture3d
-    - NAME: pack_int4_linear_weight_transposed_interleaved_buffer
-      STORAGE: buffer
+    - NAME: pack_int4_linear_weight_transposed_interleaved
diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl
@@ -21,7 +21,7 @@ layout(std430) buffer;
 ${layout_declare_tensor(B, "w", "t_out", DTYPE, OUT_STORAGE, is_scalar_array=False)}
 ${layout_declare_tensor(B, "r", "t_mat1", DTYPE, IN_STORAGE, is_scalar_array=False)}
 ${layout_declare_tensor(B, "r", "t_qmat2", "uint8", WEIGHT_STORAGE, is_scalar_array=False)}
-${layout_declare_tensor(B, "r", "t_qparams", DTYPE, "texture3D")}
+${layout_declare_tensor(B, "r", "t_qparams", DTYPE, "buffer", is_scalar_array=False)}
 
 layout(push_constant) uniform restrict Block {
   ivec4 out_sizes;
@@ -79,13 +79,23 @@ void main() {
 
   $if WEIGHT_STORAGE == "buffer":
     const int qmat2_stride = qmat2_sizes.x >> 2;
+  $if PARAMS_STORAGE == "buffer":
+    const int qparams_y_stride = out_sizes.x >> 2;
+    const int qparams_z_stride = qparams_y_stride * 2;
 
   for (int block_idx = 0; block_idx < num_blocks; ++block_idx) {
-    scales[0] = texelFetch(t_qparams, ivec3(out_col_texel_idx, 0, block_idx), 0);
-    zeros[0] = texelFetch(t_qparams, ivec3(out_col_texel_idx, 1, block_idx), 0);
+    $if PARAMS_STORAGE == "buffer":
+      scales[0] = t_qparams[block_idx * qparams_z_stride + out_col_texel_idx];
+      zeros[0] = t_qparams[block_idx * qparams_z_stride + out_col_texel_idx + qparams_y_stride];
 
-    scales[1] = texelFetch(t_qparams, ivec3(out_col_texel_idx + 1, 0, block_idx), 0);
-    zeros[1] = texelFetch(t_qparams, ivec3(out_col_texel_idx + 1, 1, block_idx), 0);
+      scales[1] = t_qparams[block_idx * qparams_z_stride + out_col_texel_idx + 1];
+      zeros[1] = t_qparams[block_idx * qparams_z_stride + out_col_texel_idx + 1 + qparams_y_stride];
+    $else:
+      scales[0] = texelFetch(t_qparams, ivec3(out_col_texel_idx, 0, block_idx), 0);
+      zeros[0] = texelFetch(t_qparams, ivec3(out_col_texel_idx, 1, block_idx), 0);
+
+      scales[1] = texelFetch(t_qparams, ivec3(out_col_texel_idx + 1, 0, block_idx), 0);
+      zeros[1] = texelFetch(t_qparams, ivec3(out_col_texel_idx + 1, 1, block_idx), 0);
 
     for (int g_idx = 0; g_idx < group_size; g_idx += 4) {
       const int k = block_idx * group_size + g_idx;
@@ -101,7 +111,7 @@ void main() {
         $else:
           const uvec4 packed_weight_tex = texelFetch(
               t_qmat2,
-              ivec3(gl_GlobalInvocationID.x, k + comp, 0),
+              ivec2(gl_GlobalInvocationID.x, k + comp),
               0);
 
         const uvec4 weight_tex_1 = (packed_weight_tex & 0xF0) >> 4;
diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml
@@ -9,12 +9,11 @@ q_4w_linear:
     DTYPE: float
     OUT_STORAGE: texture3d
     IN_STORAGE: texture3d
-    WEIGHT_STORAGE: texture3d
+    WEIGHT_STORAGE: texture2d
+    PARAMS_STORAGE: buffer
   shader_variants:
-    - NAME: q_4w_linear_texture3d_texture3d_texture3d_float
-    - NAME: q_4w_linear_texture3d_buffer_texture3d_float
-      IN_STORAGE: buffer
-    - NAME: q_4w_linear_buffer_buffer_texture3d_float
+    - NAME: q_4w_linear_texture3d_texture3d_texture2d_float
+    - NAME: q_4w_linear_buffer_buffer_texture2d_float
       OUT_STORAGE: buffer
       IN_STORAGE: buffer
     - NAME: q_4w_linear_buffer_buffer_buffer_float
diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearGroupwiseInt4.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearGroupwiseInt4.cpp
@@ -83,10 +83,9 @@ ValueRef prepack_int4_linear_weight_transposed_interleaved(
   const int64_t N = qmat2_orig_sizes.at(ndim - 2);
   const int64_t N_div2 = N / int64_t(2);
 
-  utils::StorageType storage_type = utils::kTexture3D;
-  utils::uvec3 max_extents =
-      graph.context()->adapter_ptr()->max_texture_extents();
-  if (N_div2 > max_extents[0] * 4 || K > max_extents[1]) {
+  utils::StorageType storage_type = utils::kTexture2D;
+  uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim();
+  if (N_div2 > max_extent * 4 || K > max_extent) {
     storage_type = utils::kBuffer;
   }
 
@@ -133,7 +132,7 @@ void add_q_4w_linear_node(
       prepack_int4_linear_weight_transposed_interleaved(graph, mat2_data);
 
   ValueRef scales_and_zeros = prepack_standard_hw_transposed(
-      graph, scales_and_zeros_data, utils::kTexture3D, utils::kWidthPacked);
+      graph, scales_and_zeros_data, utils::kBuffer, utils::kWidthPacked);
 
   std::string kernel_name = "q_4w_linear";
   add_storage_type_suffix(kernel_name, graph.storage_type_of(out));
diff --git a/backends/vulkan/runtime/vk_api/Adapter.h b/backends/vulkan/runtime/vk_api/Adapter.h
@@ -211,11 +211,16 @@ class Adapter final {
     return physical_device_.min_ubo_alignment;
   }
 
-  inline utils::uvec3 max_texture_extents() const {
-    return {
-        physical_device_.properties.limits.maxImageDimension1D,
-        physical_device_.properties.limits.maxImageDimension2D,
-        physical_device_.properties.limits.maxImageDimension3D};
+  inline uint32_t max_texture2d_dim() const {
+    return physical_device_.properties.limits.maxImageDimension2D;
+  }
+
+  inline uint32_t max_texture3d_dim() const {
+    return physical_device_.properties.limits.maxImageDimension3D;
+  }
+
+  inline uint32_t max_buffer_numel() const {
+    return physical_device_.properties.limits.maxStorageBufferRange;
   }
 
   // Command Buffer Submission

Original file line number	Diff line number	Diff line change
`@@ -109,8 +109,8 @@ void main() {`
`109`	`109`	`in_vals[r][0] = get_first(in_val_packed);`
`110`	`110`	`in_vals[r][1] = get_second(in_val_packed);`
`111`	`111`	`} else {`
`112`		`- in_vals[r][0] = uint8_t(254);`
`113`		`- in_vals[r][1] = uint8_t(254);`
	`112`	`+ in_vals[r][0] = uint8_t(0);`
	`113`	`+ in_vals[r][1] = uint8_t(0);`
`114`	`114`	`}`
`115`	`115`	`}`
`116`	`116`
`@@ -131,6 +131,6 @@ void main() {`
`131`	`131`	`t_qmat2[packed_pos.y * stride + packed_pos.x] = out_tex_1;`
`132`	`132`	`t_qmat2[(packed_pos.y + 1) * stride + packed_pos.x] = out_tex_2;`
`133`	`133`	`$else:`
`134`		`- imageStore(t_qmat2, ivec3(packed_pos.xy, 0), out_tex_1);`
`135`		`- imageStore(t_qmat2, ivec3(packed_pos.x, packed_pos.y + 1, 0), out_tex_2);`
	`134`	`+ imageStore(t_qmat2, packed_pos.xy, out_tex_1);`
	`135`	`+ imageStore(t_qmat2, ivec2(packed_pos.x, packed_pos.y + 1), out_tex_2);`
`136`	`136`	`}`