[ET-VK] Making stride equals dilation the default mode for conv2d dw. (#7629)

pytorchbot · trivedivivek · web-flow · commit cf8d0cf8ebd9 · 2025-01-13T17:51:05.000-06:00
* [ET-VK] Fixing conv2d dw incorrect output when stride != dilation issue. Pull Request resolved: #7595 This diff makes the current implementation of conv2d dw a special case used only when stride equals dilation in the Vulkan backend of ExecuTorch, since that's the only time this kind of caching is possible. If stride does not equal dilation, the old implementation is used. Additional test cases are added to ensure computation is correct when stride != dilation. ghstack-source-id: 261183385 @exported-using-ghexport Differential Revision: [D67908916](https://our.internmc.facebook.com/intern/diff/D67908916/) * [ET-VK] Making stride equals dilation the default mode for conv2d dw. Pull Request resolved: #7596 This diff makes stride equals dilation the default mode for the conv2d dw output op. It adds a separate source file to handle the case where stride does not equal dilation. ghstack-source-id: 261183386 Differential Revision: [D67979760](https://our.internmc.facebook.com/intern/diff/D67979760/) * Update conv2d_dw_output_tile.glsl --------- Co-authored-by: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com>
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
@@ -43,7 +43,6 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
  * output at a single output location.
  */
 
-#if STRIDE_EQ_DILATION
 void main() {
   // x and y are divided by batch size to determine 3d position
   // since work size is calculated by x * ((y + B_Y - 1) / B_Y) * z
@@ -125,42 +124,3 @@ void main() {
     }
   }
 }
-
-#else
-void main() {
-  const uint div_by_x = gl_GlobalInvocationID.x / out_limits.x;
-  const ivec3 pos = ivec3(
-    gl_GlobalInvocationID.x % out_limits.x,
-    div_by_x % out_limits.y,
-    div_by_x / out_limits.y);
-
-  if (any(greaterThanEqual(pos, out_limits))) {
-    return;
-  }
-
-  // Compute the index of the top-left element of the overlay region. Negative
-  // indices indicate that the top-left element is in a region added by padding.
-  const ivec2 ipos = pos.xy * stride - padding;
-
-  // Compute the start and end of the input indices to load. Padding is assumed
-  // to be constant 0 padding, so any reads from the padding region is skipped.
-  const ivec2 start = ipos;
-  const ivec2 end = ipos + overlay_region.xy;
-
-  VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0);
-  int kx = 0;
-  for (int y = start.y, i = 0; i < TILE_SIZE; y += dilation.y, i++) {
-    for (int x = start.x, j = 0; j < TILE_SIZE; x += dilation.x, j++) {
-      // The weight kernel was rearranged such that every NxN filter is
-      // flattened to fit in one row. Each filter was then stacked on top of
-      // each other vertically.
-      const vec4 in_texel = texelFetch(t_in, ivec3(x, y, pos.z), 0);
-      sum = fma(in_texel, texelFetch(t_kernel, ivec2(kx, pos.z), 0), sum);
-      kx++;
-    }
-  }
-
-  imageStore(t_out, pos, op(sum, out_min, out_max));
-}
-
-#endif
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_type(DTYPE)}
+
+#define TILE_SIZE ${TILE_SIZE}
+
+#define op(X, A, B) ${OPERATOR}
+
+#include "indexing_utils.h"
+
+layout(std430) buffer;
+
+${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")}
+${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")}
+${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")}
+${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")}
+${layout_declare_ubo(4, "ivec3", "out_limits")}
+${layout_declare_ubo(5, "ivec4", "in_sizes")}
+${layout_declare_ubo(6, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")}
+${layout_declare_ubo(7, "ivec2", "overlay_region", "int", "in_group_size")}
+${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+/*
+ * Depthwise convolution variant for the stride != dilation case. Each shader
+ * invocation calculates the output at a single output location.
+ */
+
+void main() {
+  const uint div_by_x = gl_GlobalInvocationID.x / out_limits.x;
+  const ivec3 pos = ivec3(
+    gl_GlobalInvocationID.x % out_limits.x,
+    div_by_x % out_limits.y,
+    div_by_x / out_limits.y);
+
+  if (any(greaterThanEqual(pos, out_limits))) {
+    return;
+  }
+
+  // Compute the index of the top-left element of the overlay region. Negative
+  // indices indicate that the top-left element is in a region added by padding.
+  const ivec2 ipos = pos.xy * stride - padding;
+
+  // Compute the start and end of the input indices to load. `end` is not read
+  // below and x/y are not bounds-checked; padding reads are assumed to yield 0.
+  const ivec2 start = ipos;
+  const ivec2 end = ipos + overlay_region.xy;
+
+  VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0);
+  int kx = 0;
+  for (int y = start.y, i = 0; i < TILE_SIZE; y += dilation.y, i++) {
+    for (int x = start.x, j = 0; j < TILE_SIZE; x += dilation.x, j++) {
+      // The weight kernel was rearranged such that every NxN filter is
+      // flattened into one row, with the filters stacked vertically: kx walks
+      // the taps of the row selected by pos.z.
+      const vec4 in_texel = texelFetch(t_in, ivec3(x, y, pos.z), 0);
+      sum = fma(in_texel, texelFetch(t_kernel, ivec2(kx, pos.z), 0), sum);
+      kx++;
+    }
+  }
+
+  imageStore(t_out, pos, op(sum, out_min, out_max));
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.yaml
@@ -0,0 +1,25 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+conv2d_dw_sned_output_tile:
+  parameter_names_with_default_values:
+    OPERATOR: X
+    NDIM: 3
+    DTYPE: float
+    TILE_SIZE: 3
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+  shader_variants:
+    - NAME: conv2d_dw_sned_output_tile_3x3
+    - NAME: conv2d_dw_sned_output_tile_3x3_clamp
+      OPERATOR: clamp(X, A, B)
+    - NAME: conv2d_dw_sned_output_tile_5x5
+      TILE_SIZE: 5
+    - NAME: conv2d_dw_sned_output_tile_5x5_clamp
+      OPERATOR: clamp(X, A, B)
+      TILE_SIZE: 5
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -134,8 +134,8 @@ vkapi::ShaderInfo get_conv2d_shader(
     case Conv2dMethod::Depthwise:
       kernel_name = "conv2d_dw";
       if (!prepack_weights) {
-        if (stride_equals_dilation) {
-          kernel_name += "_sed";
+        if (!stride_equals_dilation) {
+          kernel_name += "_sned";
         }
         const auto& weight_sizes = graph.get_tref(weight)->sizes;
         if (weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3) {