From 134ff609f5a5f2417f6bb79285bee7637b335788 Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Mon, 17 Mar 2025 16:18:19 -0700 Subject: [PATCH] [ET-VK] Adding all tensor packing support to split op. This diff updates Executorch Vulkan backend's `split` operation to support width, height and channel packed tensors. It also updates the op_registry.py file to indicate `split` operation supports all packing and adds new test cases to the cases.py file to test the operation. Differential Revision: [D71345589](https://our.internmc.facebook.com/intern/diff/D71345589/) [ghstack-poisoned] --- backends/vulkan/op_registry.py | 4 +- .../vulkan/runtime/graph/ops/impl/Split.cpp | 90 +++++++------- backends/vulkan/test/op_tests/cases.py | 115 ++++++++++-------- 3 files changed, 111 insertions(+), 98 deletions(-) diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py index f2b80c2e544..5aa805dc1b3 100644 --- a/backends/vulkan/op_registry.py +++ b/backends/vulkan/op_registry.py @@ -528,8 +528,6 @@ def register_view_op(features: OpFeatures): exir_ops.edge.aten.index_select.default, exir_ops.edge.aten.select_copy.int, # Tensor combination - exir_ops.edge.aten.split_with_sizes_copy.default, - exir_ops.edge.aten.split.Tensor, exir_ops.edge.aten.repeat.default, # Tensor creation exir_ops.edge.aten.arange.start_step, @@ -563,6 +561,8 @@ def register_ported_op(features: OpFeatures): exir_ops.edge.aten.permute_copy.default, # Tensor combination exir_ops.edge.aten.cat.default, + exir_ops.edge.aten.split_with_sizes_copy.default, + exir_ops.edge.aten.split.Tensor, ] ) def register_ported_op_all_packed_dims(features: OpFeatures): diff --git a/backends/vulkan/runtime/graph/ops/impl/Split.cpp b/backends/vulkan/runtime/graph/ops/impl/Split.cpp index b74317b078e..8002dadc538 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Split.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Split.cpp @@ -25,8 +25,6 @@ void add_split_with_sizes_default_node( ValueRef out_list_ref) { vTensorPtr t_in = graph.get_tensor(in); - VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim)); - ValueListPtr out_list = graph.get_value_list(out_list_ref); DimIndex dim_index = normalize_to_dim_index(*t_in, dim); @@ -38,62 +36,60 @@ void add_split_with_sizes_default_node( ValueRef out_ref = (*out_list)[split_idx]; vTensorPtr t_out = graph.get_tensor(out_ref); - VK_CHECK_COND(check_packed_dim_is(*t_out, WHCN::kChannelsDim)); VK_CHECK_COND(dim_at(*t_out, dim_index) == split_size); } - if (dim_index == kWidth4D) { - utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false); - utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false); + const auto packed_dim = t_in->packed_dim(); + const auto packed_dim_index = static_cast(kWidth4D - packed_dim); - for (ValueRef out_ref : *out_list) { - // Doesn't need to use split_size since we have already verified that the - // output tensor's size matches with the split_size. - vTensorPtr t_out = graph.get_tensor(out_ref); - utils::ivec3 range = t_out->logical_limits(); - add_copy_offset_node( - graph, in, range, src_offset, dst_offset, out_ref, false, true); + // Index of dimension to be concatenated in (w, h, c * b) coordinate system + const auto dim_xyz_index = std::min(2, -dim_index - 1); - src_offset[0] += range[0]; - } - } else if (dim_index == kHeight4D) { - utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false); - utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false); + utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false); + utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false); - for (ValueRef out_ref : *out_list) { - vTensorPtr t_out = graph.get_tensor(out_ref); - utils::ivec3 range = t_out->logical_limits(); - add_copy_offset_node( - graph, in, range, src_offset, dst_offset, out_ref, false, true); + const bool is_splitting_channel = (dim_index == kChannel4D); - src_offset[1] += range[1]; - } - } else if (dim_index == kBatch4D) { - utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false); - utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false); + // if splitting channels + if (is_splitting_channel) { + // set source offset w as channel size of the input tensor + src_offset[3] = dim_at(t_in->sizes(), kChannel4D); + } - for (ValueRef out_ref : *out_list) { - vTensorPtr t_out = graph.get_tensor(out_ref); - utils::ivec3 range = t_out->logical_limits(); + for (ValueRef out_ref : *out_list) { + // Doesn't need to use split_size since we have already verified that the + // output tensor's size matches with the split_size. + vTensorPtr t_out = graph.get_tensor(out_ref); + const auto out_channel_size = dim_at(t_out->sizes(), kChannel4D); + utils::ivec3 range = t_out->logical_limits(); + + if (dim_index == packed_dim_index) { + // if splitting channels, use add_copy_channel_offset_node function as + // add_copy_packed_dim_offset_node does not support channel packing + if (is_splitting_channel) { + add_copy_channel_offset_node( + graph, in, out_channel_size, src_offset[2], dst_offset[2], out_ref); + src_offset[dim_xyz_index] += out_channel_size; + } else { + // dst_offset[3] is not used now but will be used in the future when + // add_copy_packed_dim_offset_node will support channel packing + // + // set destination offset w as channel size of the output tensor if + // splitting channel + dst_offset[3] = is_splitting_channel ? out_channel_size : 0; + add_copy_packed_dim_offset_node( + graph, in, range, src_offset, dst_offset, out_ref); + src_offset[dim_xyz_index] += dim_at(t_out->sizes(), packed_dim_index); + } + } else { + // set destination offset w as channel size of the output tensor if + // splitting channels + dst_offset[3] = is_splitting_channel ? out_channel_size : 0; add_copy_offset_node( graph, in, range, src_offset, dst_offset, out_ref, false, true); - - src_offset[2] += range[2]; - } - } else if (dim_index == kChannel4D) { - int32_t src_offset = 0; - int32_t dst_offset = 0; - - for (ValueRef out_ref : *out_list) { - vTensorPtr t_out = graph.get_tensor(out_ref); - int32_t range = dim_at(t_out->sizes()); - add_copy_channel_offset_node( - graph, in, range, src_offset, dst_offset, out_ref); - src_offset += range; + src_offset[dim_xyz_index] += + is_splitting_channel ? out_channel_size : range[dim_xyz_index]; } - - } else { - VK_THROW("not ipmlemented"); } } diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index e4f7ac15434..20ed5e8e4d6 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -922,14 +922,20 @@ def get_split_with_sizes_inputs(): Test = namedtuple("VkSliceTest", ["self", "sizes", "dim"]) test_cases = [ # Split on Width + Test(self=(S1, 7, 10, 11), sizes=[1, 3, 3, 5], dim=3), Test(self=(S1, 7, 10, 10), sizes=[1, 2, 3, 4], dim=3), + Test(self=(7, 10, 11), sizes=[1, 3, 3, 5], dim=2), Test(self=(7, 10, 10), sizes=[1, 2, 3, 4], dim=2), + Test(self=(7, 10, 11), sizes=[3, 8], dim=2), Test(self=(7, 10, 10), sizes=[1, 9], dim=2), Test(self=(10, 10), sizes=[1, 9], dim=1), Test(self=(10,), sizes=[1, 9], dim=0), # Split on Height + Test(self=(S1, 7, 11, 10), sizes=[1, 3, 3, 5], dim=2), Test(self=(S1, 7, 10, 10), sizes=[1, 2, 3, 4], dim=2), + Test(self=(7, 11, 10), sizes=[1, 3, 3, 5], dim=1), Test(self=(7, 10, 10), sizes=[1, 2, 3, 4], dim=1), + Test(self=(7, 11, 11), sizes=[3, 8], dim=1), Test(self=(7, 10, 10), sizes=[10], dim=1), Test(self=(7, 6, 10), sizes=[1, 1, 1, 1, 1, 1], dim=1), Test(self=(10, 10), sizes=[1, 2, 3, 4], dim=0), @@ -937,8 +943,11 @@ def get_split_with_sizes_inputs(): Test(self=(10, 7, 10, 10), sizes=[3, 6, 1], dim=0), Test(self=(10, 7, 10, 10), sizes=[10], dim=0), # Split on Channel + Test(self=(7, 13, 4, 8), sizes=[3, 5, 2, 3], dim=1), Test(self=(7, 13, 4, 8), sizes=[3, 6, 1, 3], dim=1), + Test(self=(7, 13, 4, 8), sizes=[3, 3, 2, 5, 1], dim=1), Test(self=(7, 13, 4, 8), sizes=[3, 3, 3, 3, 1], dim=1), + Test(self=(13, 4, 8), sizes=[3, 5, 2, 1, 2], dim=0), Test(self=(13, 4, 8), sizes=[3, 3, 3, 3, 1], dim=0), Test(self=(13, 4, 8), sizes=[2, 9, 2], dim=0), Test(self=(13, 4, 8), sizes=[13], dim=0), @@ -946,6 +955,8 @@ def get_split_with_sizes_inputs(): test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) test_suite.layouts = [ + "utils::kWidthPacked", + "utils::kHeightPacked", "utils::kChannelsPacked", ] test_suite.data_gen = "make_seq_tensor" @@ -953,55 +964,61 @@ def get_split_with_sizes_inputs(): return test_suite -@register_test_suite("aten.split.Tensor") -def get_split_tensor_inputs(): - test_suite = VkTestSuite( - [ - # Split on Width - ((S1, 7, 10, 12), 12, 3), - ((S1, 7, 10, 12), 3, 3), - ((S1, 7, 10, 12), 1, 3), - ((7, 10, 12), 12, 2), - ((7, 10, 12), 3, 2), - ((7, 10, 12), 1, 2), - ((10, 12), 12, 1), - ((10, 12), 3, 1), - ((10, 12), 1, 1), - ((12,), 12, 0), - ((12,), 3, 0), - ((12,), 1, 0), - # Split on Height - ((S1, 7, 12, 8), 12, 2), - ((S1, 7, 12, 8), 3, 2), - ((S1, 7, 12, 8), 1, 2), - ((7, 12, 8), 12, 1), - ((7, 12, 8), 3, 1), - ((7, 12, 8), 1, 1), - ((12, 8), 12, 0), - ((12, 8), 3, 0), - ((12, 8), 1, 0), - # Split on Batch - ((12, 7, 10, 10), 12, 0), - ((12, 7, 10, 10), 3, 0), - ((12, 7, 10, 10), 1, 0), - # Split on Channel - ((7, 15, 10, 10), 15, 1), - ((7, 15, 10, 10), 5, 1), - ((7, 15, 10, 10), 3, 1), - ((7, 15, 10, 10), 1, 1), - ((15, 10, 10), 15, 0), - ((15, 10, 10), 5, 0), - ((15, 10, 10), 3, 0), - ((15, 10, 10), 1, 0), - ] - ) - - test_suite.layouts = [ - "utils::kChannelsPacked", - ] - test_suite.data_gen = "make_seq_tensor" - test_suite.dtypes = ["at::kFloat"] - return test_suite +# @register_test_suite("aten.split.Tensor") +# def get_split_tensor_inputs(): +# test_suite = VkTestSuite( +# [ +# # Split on Width +# ((M1, 7, 10, 12), 12, 3), +# ((S1, 7, 10, 12), 12, 3), +# ((M1, 7, 10, 12), 3, 3), +# ((S1, 7, 10, 12), 3, 3), +# ((M1, 7, 10, 12), 1, 3), +# ((S1, 7, 10, 12), 1, 3), +# ((7, 10, 12), 12, 2), +# ((7, 10, 12), 3, 2), +# ((7, 10, 12), 1, 2), +# ((2, 3, 4), 1, 2), +# ((10, 12), 12, 1), +# ((10, 12), 3, 1), +# ((10, 12), 1, 1), +# ((12,), 12, 0), +# ((12,), 3, 0), +# ((12,), 1, 0), +# # Split on Height +# ((S1, 7, 12, 8), 12, 2), +# ((S1, 7, 12, 8), 3, 2), +# ((S1, 7, 12, 8), 1, 2), +# ((7, 12, 8), 12, 1), +# ((7, 12, 8), 3, 1), +# ((7, 12, 8), 1, 1), +# ((12, 8), 12, 0), +# ((12, 8), 3, 0), +# ((12, 8), 1, 0), +# # Split on Batch +# ((12, 7, 10, 10), 12, 0), +# ((12, 7, 10, 10), 3, 0), +# ((12, 7, 10, 10), 1, 0), +# # Split on Channel +# ((7, 15, 10, 10), 15, 1), +# ((7, 15, 10, 10), 5, 1), +# ((7, 15, 10, 10), 3, 1), +# ((7, 15, 10, 10), 1, 1), +# ((15, 10, 10), 15, 0), +# ((15, 10, 10), 5, 0), +# ((15, 10, 10), 3, 0), +# ((15, 10, 10), 1, 0), +# ] +# ) + +# test_suite.layouts = [ +# "utils::kWidthPacked", +# "utils::kHeightPacked", +# "utils::kChannelsPacked", +# ] +# test_suite.data_gen = "make_seq_tensor" +# test_suite.dtypes = ["at::kFloat"] +# return test_suite def get_reduce_inputs(is_softmax: bool = False):