
Commit 818f55b

pytorchbot authored and trivedivivek committed
[ET-VK] Adding all tensor packing support to split op. (pytorch#9439)
This PR was created by the merge bot to help merge the original PR into the main branch.
ghstack PR number: pytorch#9345 by @trivedivivek
^ Please use this as the source of truth for the PR details, comments, and reviews
ghstack PR base: https://github.com/pytorch/executorch/tree/gh/trivedivivek/66/base
ghstack PR head: https://github.com/pytorch/executorch/tree/gh/trivedivivek/66/head
Merge bot PR base: https://github.com/pytorch/executorch/tree/gh/trivedivivek/65/orig
Merge bot PR head: https://github.com/pytorch/executorch/tree/gh/trivedivivek/66/orig
@diff-train-skip-merge
---------
Co-authored-by: Vivek Trivedi <[email protected]>
1 parent: 4557f7f

3 files changed, +58 -49 lines changed

backends/vulkan/op_registry.py

Lines changed: 2 additions & 2 deletions
@@ -528,8 +528,6 @@ def register_view_op(features: OpFeatures):
         exir_ops.edge.aten.index_select.default,
         exir_ops.edge.aten.select_copy.int,
         # Tensor combination
-        exir_ops.edge.aten.split_with_sizes_copy.default,
-        exir_ops.edge.aten.split.Tensor,
         exir_ops.edge.aten.repeat.default,
         # Tensor creation
         exir_ops.edge.aten.arange.start_step,
@@ -563,6 +561,8 @@ def register_ported_op(features: OpFeatures):
         exir_ops.edge.aten.permute_copy.default,
         # Tensor combination
         exir_ops.edge.aten.cat.default,
+        exir_ops.edge.aten.split_with_sizes_copy.default,
+        exir_ops.edge.aten.split.Tensor,
     ]
 )
 def register_ported_op_all_packed_dims(features: OpFeatures):
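For context, the two ops being re-registered have the following eager-mode semantics. This is an illustrative sketch in plain torch, not part of the PR:

import torch

x = torch.arange(24).reshape(2, 3, 4)

# aten.split.Tensor: fixed chunk size along a dim (the last chunk may be smaller)
chunks = torch.split(x, 2, dim=2)  # two outputs of shape (2, 3, 2)

# aten.split_with_sizes_copy: an explicit list of per-output sizes along a dim
parts = torch.split(x, [1, 3], dim=2)  # shapes (2, 3, 1) and (2, 3, 3)

Moving them out of the view-op bucket and into register_ported_op reflects that the Vulkan implementation no longer assumes a single (channels-packed) memory layout.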

backends/vulkan/runtime/graph/ops/impl/Split.cpp

Lines changed: 43 additions & 47 deletions
@@ -25,8 +25,6 @@ void add_split_with_sizes_default_node(
     ValueRef out_list_ref) {
   vTensorPtr t_in = graph.get_tensor(in);
 
-  VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim));
-
   ValueListPtr out_list = graph.get_value_list(out_list_ref);
 
   DimIndex dim_index = normalize_to_dim_index(*t_in, dim);
@@ -38,62 +36,60 @@ void add_split_with_sizes_default_node(
     ValueRef out_ref = (*out_list)[split_idx];
 
     vTensorPtr t_out = graph.get_tensor(out_ref);
-    VK_CHECK_COND(check_packed_dim_is(*t_out, WHCN::kChannelsDim));
     VK_CHECK_COND(dim_at(*t_out, dim_index) == split_size);
   }
 
-  if (dim_index == kWidth4D) {
-    utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false);
-    utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false);
+  const auto packed_dim = t_in->packed_dim();
+  const auto packed_dim_index = static_cast<DimIndex>(kWidth4D - packed_dim);
 
-    for (ValueRef out_ref : *out_list) {
-      // Doesn't need to use split_size since we have already verified that the
-      // output tensor's size matches with the split_size.
-      vTensorPtr t_out = graph.get_tensor(out_ref);
-      utils::ivec3 range = t_out->logical_limits();
-      add_copy_offset_node(
-          graph, in, range, src_offset, dst_offset, out_ref, false, true);
+  // Index of dimension to be concatenated in (w, h, c * b) coordinate system
+  const auto dim_xyz_index = std::min(2, -dim_index - 1);
 
-      src_offset[0] += range[0];
-    }
-  } else if (dim_index == kHeight4D) {
-    utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false);
-    utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false);
+  utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false);
+  utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false);
 
-    for (ValueRef out_ref : *out_list) {
-      vTensorPtr t_out = graph.get_tensor(out_ref);
-      utils::ivec3 range = t_out->logical_limits();
-      add_copy_offset_node(
-          graph, in, range, src_offset, dst_offset, out_ref, false, true);
+  const bool is_splitting_channel = (dim_index == kChannel4D);
 
-      src_offset[1] += range[1];
-    }
-  } else if (dim_index == kBatch4D) {
-    utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false);
-    utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false);
+  // if splitting channels
+  if (is_splitting_channel) {
+    // set source offset w as channel size of the input tensor
+    src_offset[3] = dim_at(t_in->sizes(), kChannel4D);
+  }
 
-    for (ValueRef out_ref : *out_list) {
-      vTensorPtr t_out = graph.get_tensor(out_ref);
-      utils::ivec3 range = t_out->logical_limits();
+  for (ValueRef out_ref : *out_list) {
+    // Doesn't need to use split_size since we have already verified that the
+    // output tensor's size matches with the split_size.
+    vTensorPtr t_out = graph.get_tensor(out_ref);
+    const auto out_channel_size = dim_at(t_out->sizes(), kChannel4D);
+    utils::ivec3 range = t_out->logical_limits();
+
+    if (dim_index == packed_dim_index) {
+      // if splitting channels, use add_copy_channel_offset_node function as
+      // add_copy_packed_dim_offset_node does not support channel packing
+      if (is_splitting_channel) {
+        add_copy_channel_offset_node(
+            graph, in, out_channel_size, src_offset[2], dst_offset[2], out_ref);
+        src_offset[dim_xyz_index] += out_channel_size;
+      } else {
+        // dst_offset[3] is not used now but will be used in the future when
+        // add_copy_packed_dim_offset_node will support channel packing
+        //
+        // set destination offset w as channel size of the output tensor if
+        // splitting channel
+        dst_offset[3] = is_splitting_channel ? out_channel_size : 0;
+        add_copy_packed_dim_offset_node(
+            graph, in, range, src_offset, dst_offset, out_ref);
+        src_offset[dim_xyz_index] += dim_at(t_out->sizes(), packed_dim_index);
+      }
+    } else {
+      // set destination offset w as channel size of the output tensor if
+      // splitting channels
+      dst_offset[3] = is_splitting_channel ? out_channel_size : 0;
       add_copy_offset_node(
           graph, in, range, src_offset, dst_offset, out_ref, false, true);
-
-      src_offset[2] += range[2];
-    }
-  } else if (dim_index == kChannel4D) {
-    int32_t src_offset = 0;
-    int32_t dst_offset = 0;
-
-    for (ValueRef out_ref : *out_list) {
-      vTensorPtr t_out = graph.get_tensor(out_ref);
-      int32_t range = dim_at<kChannel4D>(t_out->sizes());
-      add_copy_channel_offset_node(
-          graph, in, range, src_offset, dst_offset, out_ref);
-      src_offset += range;
+      src_offset[dim_xyz_index] +=
+          is_splitting_channel ? out_channel_size : range[dim_xyz_index];
     }
-
-  } else {
-    VK_THROW("not ipmlemented");
   }
 }
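The rewrite collapses the four per-dimension branches (width, height, batch, channel) into one loop that advances a running source offset along the split dimension and emits one copy node per output, dispatching to the packed-dim or channel copy node as needed. The following is a minimal eager-mode sketch of that offset-walking loop under stated assumptions: the helper name is hypothetical, and torch.narrow stands in for the Vulkan copy nodes.

import torch

# Hypothetical reference for the rewritten loop: walk a running offset along
# the split dimension and copy one slice per output, mirroring how Split.cpp
# advances src_offset[dim_xyz_index] after emitting each copy node.
def split_with_sizes_reference(x, sizes, dim):
    assert sum(sizes) == x.size(dim)  # same invariant the VK_CHECK_COND enforces
    outs, offset = [], 0
    for size in sizes:
        outs.append(x.narrow(dim, offset, size).clone())
        offset += size
    return outs

# Sanity check against torch.split, using one of the new channel-split cases
x = torch.arange(7 * 13 * 4 * 8).reshape(7, 13, 4, 8)
for ref, expected in zip(
    split_with_sizes_reference(x, [3, 5, 2, 3], dim=1),
    torch.split(x, [3, 5, 2, 3], dim=1),
):
    assert torch.equal(ref, expected)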

backends/vulkan/test/op_tests/cases.py

Lines changed: 13 additions & 0 deletions
@@ -922,30 +922,41 @@ def get_split_with_sizes_inputs():
     Test = namedtuple("VkSliceTest", ["self", "sizes", "dim"])
     test_cases = [
         # Split on Width
+        Test(self=(S1, 7, 10, 11), sizes=[1, 3, 2, 5], dim=3),
         Test(self=(S1, 7, 10, 10), sizes=[1, 2, 3, 4], dim=3),
+        Test(self=(7, 10, 11), sizes=[1, 3, 2, 5], dim=2),
         Test(self=(7, 10, 10), sizes=[1, 2, 3, 4], dim=2),
+        Test(self=(7, 10, 11), sizes=[3, 8], dim=2),
         Test(self=(7, 10, 10), sizes=[1, 9], dim=2),
         Test(self=(10, 10), sizes=[1, 9], dim=1),
         Test(self=(10,), sizes=[1, 9], dim=0),
         # Split on Height
+        Test(self=(S1, 7, 11, 10), sizes=[1, 3, 2, 5], dim=2),
         Test(self=(S1, 7, 10, 10), sizes=[1, 2, 3, 4], dim=2),
+        Test(self=(7, 11, 10), sizes=[1, 3, 2, 5], dim=1),
         Test(self=(7, 10, 10), sizes=[1, 2, 3, 4], dim=1),
+        Test(self=(7, 11, 11), sizes=[3, 8], dim=1),
         Test(self=(7, 10, 10), sizes=[10], dim=1),
         Test(self=(7, 6, 10), sizes=[1, 1, 1, 1, 1, 1], dim=1),
         Test(self=(10, 10), sizes=[1, 2, 3, 4], dim=0),
         # Split on Batch
         Test(self=(10, 7, 10, 10), sizes=[3, 6, 1], dim=0),
         Test(self=(10, 7, 10, 10), sizes=[10], dim=0),
         # Split on Channel
+        Test(self=(7, 13, 4, 8), sizes=[3, 5, 2, 3], dim=1),
         Test(self=(7, 13, 4, 8), sizes=[3, 6, 1, 3], dim=1),
+        Test(self=(7, 13, 4, 8), sizes=[3, 2, 2, 5, 1], dim=1),
         Test(self=(7, 13, 4, 8), sizes=[3, 3, 3, 3, 1], dim=1),
+        Test(self=(13, 4, 8), sizes=[3, 5, 2, 1, 2], dim=0),
         Test(self=(13, 4, 8), sizes=[3, 3, 3, 3, 1], dim=0),
         Test(self=(13, 4, 8), sizes=[2, 9, 2], dim=0),
         Test(self=(13, 4, 8), sizes=[13], dim=0),
     ]
     test_suite = VkTestSuite([tuple(tc) for tc in test_cases])
 
     test_suite.layouts = [
+        "utils::kWidthPacked",
+        "utils::kHeightPacked",
         "utils::kChannelsPacked",
     ]
     test_suite.data_gen = "make_seq_tensor"
@@ -997,6 +1008,8 @@ def get_split_tensor_inputs():
     )
 
     test_suite.layouts = [
+        "utils::kWidthPacked",
+        "utils::kHeightPacked",
         "utils::kChannelsPacked",
     ]
     test_suite.data_gen = "make_seq_tensor"
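Adding kWidthPacked and kHeightPacked to test_suite.layouts runs every case under all three packed layouts, which is what exercises the new non-channels-packed paths. Each new case also preserves the invariant the kernel asserts per output, VK_CHECK_COND(dim_at(*t_out, dim_index) == split_size): the sizes must tile the split dimension exactly. A quick sketch of that check; S1 is a shared size constant defined elsewhere in cases.py, so its value below is a placeholder.

# S1 is a shared size constant defined elsewhere in cases.py; 11 is a
# placeholder value for illustration only.
S1 = 11

def check_case(shape, sizes, dim):
    # The split sizes must tile the split dimension exactly, matching the
    # per-output VK_CHECK_COND(dim_at(*t_out, dim_index) == split_size)
    # check in Split.cpp.
    assert sum(sizes) == shape[dim], (shape, sizes, dim)

check_case((S1, 7, 10, 11), [1, 3, 2, 5], dim=3)   # new width-split case
check_case((7, 13, 4, 8), [3, 2, 2, 5, 1], dim=1)  # new channel-split case
check_case((13, 4, 8), [3, 5, 2, 1, 2], dim=0)     # new 3-D channel-split case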
